From 4730faca618ff9cee0780580145e3cbe86f24876 Mon Sep 17 00:00:00 2001 From: Austin <77757836+teleprint-me@users.noreply.github.com> Date: Sun, 28 Jul 2024 03:52:42 -0400 Subject: [PATCH 01/28] chore : Fix vulkan related compiler warnings, add help text, improve CLI options (#8477) * chore: Fix compiler warnings, add help text, improve CLI options * Add prototypes for function definitions * Invert logic of --no-clean option to be more intuitive * Provide a new help prompt with clear instructions * chore : Add ignore rule for vulkan shader generator Signed-off-by: teleprint-me <77757836+teleprint-me@users.noreply.github.com> * Update ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp Co-authored-by: 0cc4m * chore : Remove void and apply C++ style empty parameters * chore : Remove void and apply C++ style empty parameters --------- Signed-off-by: teleprint-me <77757836+teleprint-me@users.noreply.github.com> Co-authored-by: 0cc4m --- .gitignore | 1 + .../src/vulkan-shaders/vulkan-shaders-gen.cpp | 33 +++++++++++++++++-- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 7c7dee0c6..c9b4d9983 100644 --- a/.gitignore +++ b/.gitignore @@ -50,6 +50,7 @@ build* !docs/build.md /libllama.so /llama-* +/vulkan-shaders-gen android-ndk-* arm_neon.h cmake-build-* diff --git a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp index c5be3754b..c9dbf9dfd 100644 --- a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp @@ -30,6 +30,20 @@ #define ASYNCIO_CONCURRENCY 64 +// define prototypes +void execute_command(const std::string& command, std::string& stdout_str, std::string& stderr_str); +bool directory_exists(const std::string& path); +bool create_directory(const std::string& path); +std::string to_uppercase(const std::string& input); +bool string_ends_with(const std::string& str, const std::string& suffix); +std::string join_paths(const std::string& path1, const std::string& path2); +std::string basename(const std::string &path); +void string_to_spv(const std::string& _name, const std::string& in_fname, const std::map& defines, bool fp16); +std::map merge_maps(const std::map& a, const std::map& b); +void matmul_shaders(std::vector>& tasks, bool fp16, bool matmul_id); +void process_shaders(std::vector>& tasks); +void write_output_files(); + std::mutex lock; std::vector> shader_fnames; @@ -38,7 +52,7 @@ std::string input_dir = "vulkan-shaders"; std::string output_dir = "/tmp"; std::string target_hpp = "ggml-vulkan-shaders.hpp"; std::string target_cpp = "ggml-vulkan-shaders.cpp"; -bool no_clean = false; +bool clean = true; const std::vector type_names = { "f32", @@ -464,8 +478,9 @@ void write_output_files() { } fprintf(src, "\n};\n\n"); - if (!no_clean) { + if (clean) { std::remove(path.c_str()); + // fprintf(stderr, "Removed: %s\n", path.c_str()); } } @@ -481,6 +496,18 @@ int main(int argc, char** argv) { } } + if (argc <= 1 || args.find("--help") != args.end()) { + std::cout << "Usage:\n" + "\tvulkan-shaders-gen [options]\n\n" + "Options:\n" + "\t--glslc Path to glslc executable (default: /usr/bin/glslc)\n" + "\t--input-dir Directory containing shader sources (required)\n" + "\t--output-dir Output directory for generated SPIR-V files and optional C++ headers\n" + "\t--target-hpp Path to generate a header file with shader declarations in C++ format\n" + "\t--target-cpp Path to generate a source code file implementing the declared shaders (optional)\n" + "\t--no-clean Keep temporary SPIR-V files after build (default: remove them)\n"; + return EXIT_SUCCESS; + } if (args.find("--glslc") != args.end()) { GLSLC = args["--glslc"]; // Path to glslc } @@ -497,7 +524,7 @@ int main(int argc, char** argv) { target_cpp = args["--target-cpp"]; // Path to generated cpp file } if (args.find("--no-clean") != args.end()) { - no_clean = true; // Keep temporary SPIR-V files in output-dir after build + clean = false; // Keep temporary SPIR-V files in output-dir after build } if (!directory_exists(input_dir)) { From 6eeaeba126ff701f3e8f79f246805b7023709972 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sun, 28 Jul 2024 22:32:44 +0200 Subject: [PATCH 02/28] cmake: use 1 more thread for non-ggml in CI (#8740) --- .github/workflows/build.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a1e183d11..b9246659a 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -860,7 +860,8 @@ jobs: mkdir build cd build cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON - cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) + cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml + cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} - name: Determine tag name id: tag From 0832de723695ab400316a6c49b9f712380e3a731 Mon Sep 17 00:00:00 2001 From: "Meng, Hengyu" Date: Mon, 29 Jul 2024 10:50:27 +0800 Subject: [PATCH 03/28] [SYCL] add conv support (#8688) --- ggml/src/ggml-sycl.cpp | 12 +++++ ggml/src/ggml-sycl/backend.hpp | 1 + ggml/src/ggml-sycl/conv.cpp | 99 ++++++++++++++++++++++++++++++++++ ggml/src/ggml-sycl/conv.hpp | 21 ++++++++ ggml/src/ggml-sycl/presets.hpp | 1 + 5 files changed, 134 insertions(+) create mode 100644 ggml/src/ggml-sycl/conv.cpp create mode 100644 ggml/src/ggml-sycl/conv.hpp diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp index 7cb07d0dc..d1dd07f64 100644 --- a/ggml/src/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl.cpp @@ -3981,6 +3981,9 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens ggml_sycl_func_t func; switch (tensor->op) { + case GGML_OP_CONV_TRANSPOSE_1D: + func = ggml_sycl_op_conv_transpose_1d; + break; case GGML_OP_REPEAT: func = ggml_sycl_repeat; break; @@ -5090,6 +5093,15 @@ GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t back GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_tensor * op) { switch (op->op) { + case GGML_OP_CONV_TRANSPOSE_1D: + { + ggml_type src0_type = op->src[0]->type; + ggml_type src1_type = op->src[1]->type; + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) { + return true; + } + return false; + } break; case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { case GGML_UNARY_OP_GELU: diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp index 067181de3..98b0ebc19 100644 --- a/ggml/src/ggml-sycl/backend.hpp +++ b/ggml/src/ggml-sycl/backend.hpp @@ -15,6 +15,7 @@ #include "concat.hpp" #include "common.hpp" +#include "conv.hpp" #include "convert.hpp" #include "dequantize.hpp" #include "dmmv.hpp" diff --git a/ggml/src/ggml-sycl/conv.cpp b/ggml/src/ggml-sycl/conv.cpp new file mode 100644 index 000000000..bc4ab1ddb --- /dev/null +++ b/ggml/src/ggml-sycl/conv.cpp @@ -0,0 +1,99 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#include "conv.hpp" + +static void conv_transpose_1d_kernel( + const int s0, const int output_size, + const int src0_ne0, const int src0_ne1, const int src0_ne2, + const int src1_ne0, const int dst_ne0, + const float * src0, const float * src1, float * dst, + const sycl::nd_item<3> &item_ct1) { + int global_index = item_ct1.get_local_id(2) + + item_ct1.get_group(2) * item_ct1.get_local_range(2); + if (global_index >= output_size) { + return; + } + + int out_index = global_index / dst_ne0; + + float accumulator = 0; + + for (int c = 0; c < src0_ne2; c++) { + int idx = global_index % dst_ne0; + + int kernel_offset = (src0_ne0 * src0_ne1 * c) + (out_index * src0_ne0); + int input_offset = src1_ne0 * c; + + for (int i = 0; i < src1_ne0; i++) { + if (!(idx >= i*s0 && idx < i*s0 + src0_ne0)) { + continue; + } + int weight_idx = idx - i*s0; + + float kernel_weight = src0[kernel_offset + weight_idx]; + float input_value = src1[input_offset+i]; + + accumulator += kernel_weight * input_value; + } + } + dst[global_index] = accumulator; +} + +static void conv_transpose_1d_f32_f32_sycl( + const int s0, const int output_size, + const int src0_ne0, const int src0_ne1, const int src0_ne2, + const int src1_ne0, const int dst_ne0, + const float *src0, const float *src1, float *dst, + const queue_ptr& stream) { + + const int num_blocks = (output_size + SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE - 1) / SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE; + const sycl::range<3> block_dims(1, 1, SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE); + const sycl::range<3> block_nums(1, 1, num_blocks); + stream->parallel_for( + sycl::nd_range<3>( + block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + conv_transpose_1d_kernel( + s0, output_size, + src0_ne0, src0_ne1, src0_ne2, + src1_ne0, dst_ne0, + src0, src1, dst, item_ct1); + }); +} + +void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst) { + const float * src0_d = (const float *)src0->data; + const float * src1_d = (const float *)src1->data; + + float * dst_d = (float *)dst->data; + dpct::queue_ptr stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + + const int32_t * opts = (const int32_t *)dst->op_params; + + const int s0 = opts[0]; + + const int64_t output_size = ggml_nelements(dst); + + conv_transpose_1d_f32_f32_sycl(s0, output_size, + src0->ne[0], src0->ne[1], src0->ne[2], + src1->ne[0], dst->ne[0], + src0_d, src1_d, dst_d, stream); +} + diff --git a/ggml/src/ggml-sycl/conv.hpp b/ggml/src/ggml-sycl/conv.hpp new file mode 100644 index 000000000..eb20730f9 --- /dev/null +++ b/ggml/src/ggml-sycl/conv.hpp @@ -0,0 +1,21 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_CONV_HPP +#define GGML_SYCL_CONV_HPP + +#include "common.hpp" + +void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst); + +#endif // GGML_SYCL_CONV_HPP diff --git a/ggml/src/ggml-sycl/presets.hpp b/ggml/src/ggml-sycl/presets.hpp index 15ddcac1f..479789626 100644 --- a/ggml/src/ggml-sycl/presets.hpp +++ b/ggml/src/ggml-sycl/presets.hpp @@ -41,6 +41,7 @@ #define SYCL_ACC_BLOCK_SIZE 256 #define SYCL_IM2COL_BLOCK_SIZE 256 #define SYCL_POOL2D_BLOCK_SIZE 256 +#define SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE 256 // dmmv = dequantize_mul_mat_vec #ifndef GGML_SYCL_DMMV_X From 439b3fc75a8deb42899ac47a8f52aae75e0339fe Mon Sep 17 00:00:00 2001 From: R0CKSTAR Date: Mon, 29 Jul 2024 20:56:12 +0800 Subject: [PATCH 04/28] cuda : organize vendor-specific headers into vendors directory (#8746) Signed-off-by: Xiaodong Ye --- ggml/src/ggml-cuda/common.cuh | 378 +----------------------------- ggml/src/ggml-cuda/vendors/cuda.h | 14 ++ ggml/src/ggml-cuda/vendors/hip.h | 177 ++++++++++++++ ggml/src/ggml-cuda/vendors/musa.h | 171 ++++++++++++++ 4 files changed, 366 insertions(+), 374 deletions(-) create mode 100644 ggml/src/ggml-cuda/vendors/cuda.h create mode 100644 ggml/src/ggml-cuda/vendors/hip.h create mode 100644 ggml/src/ggml-cuda/vendors/musa.h diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 8c3c20b90..eb39b6d23 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -27,255 +27,11 @@ #include #if defined(GGML_USE_HIPBLAS) -#include -#include -#include -#ifdef __HIP_PLATFORM_AMD__ -// for rocblas_initialize() -#include "rocblas/rocblas.h" -#endif // __HIP_PLATFORM_AMD__ -#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F -#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F -#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F -#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT -#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT -#define CUBLAS_OP_N HIPBLAS_OP_N -#define CUBLAS_OP_T HIPBLAS_OP_T -#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS -#define CUBLAS_TF32_TENSOR_OP_MATH 0 -#define CUDA_R_16F HIPBLAS_R_16F -#define CUDA_R_32F HIPBLAS_R_32F -#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) -#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6 -#define cublasCreate hipblasCreate -#define cublasDestroy hipblasDestroy -#define cublasGemmEx hipblasGemmEx -#define cublasGemmBatchedEx hipblasGemmBatchedEx -#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx -#define cublasHandle_t hipblasHandle_t -#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS -#define cublasSetStream hipblasSetStream -#define cublasSgemm hipblasSgemm -#define cublasStatus_t hipblasStatus_t -#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6 -#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer -#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess -#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess -#define cudaDeviceProp hipDeviceProp_t -#define cudaDeviceSynchronize hipDeviceSynchronize -#define cudaError_t hipError_t -#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled -#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled -#define cudaEventCreateWithFlags hipEventCreateWithFlags -#define cudaEventDisableTiming hipEventDisableTiming -#define cudaEventRecord hipEventRecord -#define cudaEventSynchronize hipEventSynchronize -#define cudaEvent_t hipEvent_t -#define cudaEventDestroy hipEventDestroy -#define cudaFree hipFree -#define cudaFreeHost hipHostFree -#define cudaGetDevice hipGetDevice -#define cudaGetDeviceCount hipGetDeviceCount -#define cudaGetDeviceProperties hipGetDeviceProperties -#define cudaGetErrorString hipGetErrorString -#define cudaGetLastError hipGetLastError -#define cudaHostRegister hipHostRegister -#define cudaHostRegisterPortable hipHostRegisterPortable -#define cudaHostRegisterReadOnly hipHostRegisterReadOnly -#define cudaHostUnregister hipHostUnregister -#define cudaLaunchHostFunc hipLaunchHostFunc -#define cudaMalloc hipMalloc -#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault) -#define cudaMemcpy hipMemcpy -#define cudaMemcpyAsync hipMemcpyAsync -#define cudaMemcpyPeerAsync hipMemcpyPeerAsync -#define cudaMemcpy2DAsync hipMemcpy2DAsync -#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice -#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost -#define cudaMemcpyHostToDevice hipMemcpyHostToDevice -#define cudaMemcpyKind hipMemcpyKind -#define cudaMemset hipMemset -#define cudaMemsetAsync hipMemsetAsync -#define cudaMemGetInfo hipMemGetInfo -#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize -#define cudaSetDevice hipSetDevice -#define cudaStreamCreateWithFlags hipStreamCreateWithFlags -#define cudaStreamDestroy hipStreamDestroy -#define cudaStreamFireAndForget hipStreamFireAndForget -#define cudaStreamNonBlocking hipStreamNonBlocking -#define cudaStreamPerThread hipStreamPerThread -#define cudaStreamSynchronize hipStreamSynchronize -#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags) -#define cudaStream_t hipStream_t -#define cudaSuccess hipSuccess -#define __trap() do { abort(); __builtin_unreachable(); } while(0) -#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS -#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED -#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED -#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE -#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH -#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR -#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED -#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR -#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED +#include "vendors/hip.h" #elif defined(GGML_USE_MUSA) -#include -#include -#include -#include -// XXX: Keep the following order the same as hipBLAS -// #define CUBLAS_COMPUTE_16F MUBLAS_COMPUTE_16F -// #define CUBLAS_COMPUTE_32F MUBLAS_COMPUTE_32F -#define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F -#define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT -#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT -#define CUBLAS_OP_N MUBLAS_OP_N -#define CUBLAS_OP_T MUBLAS_OP_T -#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS -// #define CUBLAS_TF32_TENSOR_OP_MATH 0 -#define CUDA_R_16F MUSA_R_16F -#define CUDA_R_32F MUSA_R_32F -// #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) -// #define cublasComputeType_t mublasComputeType_t -#define cublasCreate mublasCreate -#define cublasDestroy mublasDestroy -#define cublasGemmEx mublasGemmEx -#define cublasGemmBatchedEx mublasGemmBatchedEx -#define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx -#define cublasHandle_t mublasHandle_t -// #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS -#define cublasSetMathMode mublasSetMathMode -#define cublasSetStream mublasSetStream -#define cublasSgemm mublasSgemm -#define cublasStatus_t mublasStatus_t -#define cudaDataType_t musaDataType_t //deprecated, new hipblasDatatype not in 5.6 -#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer -#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess -#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess -#define cudaDeviceProp musaDeviceProp -#define cudaDeviceSynchronize musaDeviceSynchronize -#define cudaError_t musaError_t -#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled -#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled -#define cudaEventCreateWithFlags musaEventCreateWithFlags -#define cudaEventDisableTiming musaEventDisableTiming -#define cudaEventRecord musaEventRecord -#define cudaEventSynchronize musaEventSynchronize -#define cudaEvent_t musaEvent_t -#define cudaEventDestroy musaEventDestroy -#define cudaFree musaFree -#define cudaFreeHost musaFreeHost -#define cudaGetDevice musaGetDevice -#define cudaGetDeviceCount musaGetDeviceCount -#define cudaGetDeviceProperties musaGetDeviceProperties -#define cudaGetErrorString musaGetErrorString -#define cudaGetLastError musaGetLastError -#define cudaHostRegister musaHostRegister -#define cudaHostRegisterPortable musaHostRegisterPortable -#define cudaHostRegisterReadOnly musaHostRegisterReadOnly -#define cudaHostUnregister musaHostUnregister -#define cudaLaunchHostFunc musaLaunchHostFunc -#define cudaMalloc musaMalloc -#define cudaMallocHost musaMallocHost -#define cudaMemcpy musaMemcpy -#define cudaMemcpyAsync musaMemcpyAsync -#define cudaMemcpyPeerAsync musaMemcpyPeerAsync -#define cudaMemcpy2DAsync musaMemcpy2DAsync -#define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice -#define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost -#define cudaMemcpyHostToDevice musaMemcpyHostToDevice -#define cudaMemcpyKind musaMemcpyKind -#define cudaMemset musaMemset -#define cudaMemsetAsync musaMemsetAsync -#define cudaMemGetInfo musaMemGetInfo -#define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize -#define cudaSetDevice musaSetDevice -#define cudaStreamCreateWithFlags musaStreamCreateWithFlags -#define cudaStreamDestroy musaStreamDestroy -#define cudaStreamFireAndForget musaStreamFireAndForget -#define cudaStreamNonBlocking musaStreamNonBlocking -#define cudaStreamPerThread musaStreamPerThread -#define cudaStreamSynchronize musaStreamSynchronize -#define cudaStreamWaitEvent musaStreamWaitEvent -#define cudaStream_t musaStream_t -#define cudaSuccess musaSuccess - -// XXX: Other CUDA => MUSA mapping -#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE -#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED -#define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED -#define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE -#define CUdevice MUdevice -#define CUdeviceptr MUdeviceptr -#define CUmemAccessDesc MUmemAccessDesc -#define CUmemAllocationProp MUmemAllocationProp -#define CUmemGenericAllocationHandle MUmemGenericAllocationHandle -#define cuDeviceGet muDeviceGet -#define cuDeviceGetAttribute muDeviceGetAttribute -#define cuMemAddressFree muMemAddressFree -#define cuMemAddressReserve muMemAddressReserve -#define cuMemCreate muMemCreate -#define cuMemGetAllocationGranularity muMemGetAllocationGranularity -#define cuMemMap muMemMap -#define cuMemRelease muMemRelease -#define cuMemSetAccess muMemSetAccess -#define cuMemUnmap muMemUnmap -#define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize -#define cudaFuncSetAttribute musaFuncSetAttribute -#define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms -#define make_cudaExtent make_musaExtent -#define make_cudaPitchedPtr make_musaPitchedPtr - -// XXX: USE_CUDA_GRAPH -#define CUDA_SUCCESS MUSA_SUCCESS -#define CUresult MUresult -#define cuGetErrorString muGetErrorString -#define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure -#define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction -#define cudaGraphDestroy musaGraphDestroy -#define cudaGraphExecDestroy musaGraphExecDestroy -#define cudaGraphExec_t musaGraphExec_t -#define cudaGraphExecUpdate musaGraphExecUpdate -#define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult -#define cudaGraphGetNodes musaGraphGetNodes -#define cudaGraphInstantiate musaGraphInstantiate -#define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams -#define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams -#define cudaGraphLaunch musaGraphLaunch -#define cudaGraphNodeGetType musaGraphNodeGetType -#define cudaGraphNode_t musaGraphNode_t -#define cudaGraphNodeType musaGraphNodeType -#define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel -#define cudaGraph_t musaGraph_t -#define cudaKernelNodeParams musaKernelNodeParams -#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed -#define cudaStreamEndCapture musaStreamEndCapture - -// XXX: cuBLAS => muBLAS mapping -#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED -#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT -#define CUBLAS_COMPUTE_16F CUDA_R_16F -#define CUBLAS_COMPUTE_32F CUDA_R_32F -#define cublasComputeType_t cudaDataType_t - -// XXX: Clang builtins mapping -#define __vsub4 __vsub4_musa -#define __vcmpeq4 __vcmpeq4_musa -#define __vcmpne4 __vcmpne4_musa +#include "vendors/musa.h" #else -#include -#include -#include -#include - -#if CUDART_VERSION < 11020 -#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED -#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH -#define CUBLAS_COMPUTE_16F CUDA_R_16F -#define CUBLAS_COMPUTE_32F CUDA_R_32F -#define cublasComputeType_t cudaDataType_t -#endif // CUDART_VERSION < 11020 - +#include "vendors/cuda.h" #endif // defined(GGML_USE_HIPBLAS) #define STRINGIZE_IMPL(...) #__VA_ARGS__ @@ -318,11 +74,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in #if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA) static const char * cublas_get_error_str(const cublasStatus_t err) { -#ifndef GGML_USE_MUSA return cublasGetStatusString(err); -#else - return mublasStatus_to_string(err); -#endif // GGML_USE_MUSA } #else static const char * cublas_get_error_str(const cublasStatus_t err) { @@ -364,129 +116,7 @@ typedef half2 dfloat2; #else typedef float dfloat; // dequantize float typedef float2 dfloat2; -#endif //GGML_CUDA_F16 - -#if defined(GGML_USE_MUSA) -#ifndef __has_builtin - #define __has_builtin(x) 0 -#endif - -typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4))); - -static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) { - return __vsubss4(a, b); -} - -static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) { - const uint8x4_t& va = reinterpret_cast(a); - const uint8x4_t& vb = reinterpret_cast(b); - unsigned int c; - uint8x4_t& vc = reinterpret_cast(c); -#pragma unroll - for (int i = 0; i < 4; ++i) { - vc[i] = va[i] == vb[i] ? 0xff : 0x00; - } - return c; -} - -static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) { - const uint8x4_t& va = reinterpret_cast(a); - const uint8x4_t& vb = reinterpret_cast(b); - unsigned int c; - uint8x4_t& vc = reinterpret_cast(c); -#pragma unroll - for (int i = 0; i < 4; ++i) { - vc[i] = va[i] == vb[i] ? 0x00 : 0xff; - } - return c; -} -#endif // defined(GGML_USE_MUSA) - -#if defined(GGML_USE_HIPBLAS) -#define __CUDA_ARCH__ 1300 - -#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \ - defined(__gfx1150__) || defined(__gfx1151__) -#define RDNA3 -#endif - -#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \ - defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__) -#define RDNA2 -#endif - -#if defined(__gfx1010__) || defined(__gfx1012__) -#define RDNA1 -#endif - -#ifndef __has_builtin - #define __has_builtin(x) 0 -#endif - -typedef int8_t int8x4_t __attribute__((ext_vector_type(4))); -typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4))); -static __device__ __forceinline__ int __vsubss4(const int a, const int b) { - const int8x4_t va = reinterpret_cast(a); - const int8x4_t vb = reinterpret_cast(b); -#if __has_builtin(__builtin_elementwise_sub_sat) - const int8x4_t c = __builtin_elementwise_sub_sat(va, vb); - return reinterpret_cast(c); -#else - int8x4_t c; - int16_t tmp; -#pragma unroll - for (int i = 0; i < 4; i++) { - tmp = va[i] - vb[i]; - if(tmp > std::numeric_limits::max()) tmp = std::numeric_limits::max(); - if(tmp < std::numeric_limits::min()) tmp = std::numeric_limits::min(); - c[i] = tmp; - } - return reinterpret_cast(c); -#endif // __has_builtin(__builtin_elementwise_sub_sat) -} - -static __device__ __forceinline__ int __vsub4(const int a, const int b) { - return __vsubss4(a, b); -} - -static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) { - const uint8x4_t& va = reinterpret_cast(a); - const uint8x4_t& vb = reinterpret_cast(b); - unsigned int c; - uint8x4_t& vc = reinterpret_cast(c); -#pragma unroll - for (int i = 0; i < 4; ++i) { - vc[i] = va[i] == vb[i] ? 0xff : 0x00; - } - return c; -} - -static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) { - const uint8x4_t& va = reinterpret_cast(a); - const uint8x4_t& vb = reinterpret_cast(b); - unsigned int c; - uint8x4_t& vc = reinterpret_cast(c); -#pragma unroll - for (int i = 0; i < 4; ++i) { - vc[i] = va[i] == vb[i] ? 0x00 : 0xff; - } - return c; -} - -#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000 -// __shfl_xor() for half2 was added in ROCm 5.6 -static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) { - typedef union half2_b32 { - half2 val; - int b32; - } half2_b32_t; - half2_b32_t tmp; - tmp.val = var; - tmp.b32 = __shfl_xor(tmp.b32, laneMask, width); - return tmp.val; -} -#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000 -#endif // defined(GGML_USE_HIPBLAS) +#endif // GGML_CUDA_F16 #if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL #define FP16_AVAILABLE diff --git a/ggml/src/ggml-cuda/vendors/cuda.h b/ggml/src/ggml-cuda/vendors/cuda.h new file mode 100644 index 000000000..db9f6a165 --- /dev/null +++ b/ggml/src/ggml-cuda/vendors/cuda.h @@ -0,0 +1,14 @@ +#pragma once + +#include +#include +#include +#include + +#if CUDART_VERSION < 11020 +#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED +#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH +#define CUBLAS_COMPUTE_16F CUDA_R_16F +#define CUBLAS_COMPUTE_32F CUDA_R_32F +#define cublasComputeType_t cudaDataType_t +#endif // CUDART_VERSION < 11020 diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h new file mode 100644 index 000000000..d0c377255 --- /dev/null +++ b/ggml/src/ggml-cuda/vendors/hip.h @@ -0,0 +1,177 @@ +#pragma once + +#include +#include +#include +#ifdef __HIP_PLATFORM_AMD__ +// for rocblas_initialize() +#include "rocblas/rocblas.h" +#endif // __HIP_PLATFORM_AMD__ +#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F +#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F +#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_T HIPBLAS_OP_T +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUBLAS_TF32_TENSOR_OP_MATH 0 +#define CUDA_R_16F HIPBLAS_R_16F +#define CUDA_R_32F HIPBLAS_R_32F +#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) +#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6 +#define cublasCreate hipblasCreate +#define cublasDestroy hipblasDestroy +#define cublasGemmEx hipblasGemmEx +#define cublasGemmBatchedEx hipblasGemmBatchedEx +#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx +#define cublasHandle_t hipblasHandle_t +#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS +#define cublasSetStream hipblasSetStream +#define cublasSgemm hipblasSgemm +#define cublasStatus_t hipblasStatus_t +#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6 +#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer +#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess +#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess +#define cudaDeviceProp hipDeviceProp_t +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaError_t hipError_t +#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled +#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled +#define cudaEventCreateWithFlags hipEventCreateWithFlags +#define cudaEventDisableTiming hipEventDisableTiming +#define cudaEventRecord hipEventRecord +#define cudaEventSynchronize hipEventSynchronize +#define cudaEvent_t hipEvent_t +#define cudaEventDestroy hipEventDestroy +#define cudaFree hipFree +#define cudaFreeHost hipHostFree +#define cudaGetDevice hipGetDevice +#define cudaGetDeviceCount hipGetDeviceCount +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaGetErrorString hipGetErrorString +#define cudaGetLastError hipGetLastError +#define cudaHostRegister hipHostRegister +#define cudaHostRegisterPortable hipHostRegisterPortable +#define cudaHostRegisterReadOnly hipHostRegisterReadOnly +#define cudaHostUnregister hipHostUnregister +#define cudaLaunchHostFunc hipLaunchHostFunc +#define cudaMalloc hipMalloc +#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault) +#define cudaMemcpy hipMemcpy +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaMemcpyPeerAsync hipMemcpyPeerAsync +#define cudaMemcpy2DAsync hipMemcpy2DAsync +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemcpyKind hipMemcpyKind +#define cudaMemset hipMemset +#define cudaMemsetAsync hipMemsetAsync +#define cudaMemGetInfo hipMemGetInfo +#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize +#define cudaSetDevice hipSetDevice +#define cudaStreamCreateWithFlags hipStreamCreateWithFlags +#define cudaStreamDestroy hipStreamDestroy +#define cudaStreamFireAndForget hipStreamFireAndForget +#define cudaStreamNonBlocking hipStreamNonBlocking +#define cudaStreamPerThread hipStreamPerThread +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags) +#define cudaStream_t hipStream_t +#define cudaSuccess hipSuccess +#define __trap() do { abort(); __builtin_unreachable(); } while(0) +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED +#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED +#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE +#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH +#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR +#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED +#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR +#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED + +#define __CUDA_ARCH__ 1300 + +#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \ + defined(__gfx1150__) || defined(__gfx1151__) +#define RDNA3 +#endif + +#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \ + defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__) +#define RDNA2 +#endif + +#if defined(__gfx1010__) || defined(__gfx1012__) +#define RDNA1 +#endif + +#ifndef __has_builtin + #define __has_builtin(x) 0 +#endif + +typedef int8_t int8x4_t __attribute__((ext_vector_type(4))); +typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4))); +static __device__ __forceinline__ int __vsubss4(const int a, const int b) { + const int8x4_t va = reinterpret_cast(a); + const int8x4_t vb = reinterpret_cast(b); +#if __has_builtin(__builtin_elementwise_sub_sat) + const int8x4_t c = __builtin_elementwise_sub_sat(va, vb); + return reinterpret_cast(c); +#else + int8x4_t c; + int16_t tmp; +#pragma unroll + for (int i = 0; i < 4; i++) { + tmp = va[i] - vb[i]; + if(tmp > std::numeric_limits::max()) tmp = std::numeric_limits::max(); + if(tmp < std::numeric_limits::min()) tmp = std::numeric_limits::min(); + c[i] = tmp; + } + return reinterpret_cast(c); +#endif // __has_builtin(__builtin_elementwise_sub_sat) +} + +static __device__ __forceinline__ int __vsub4(const int a, const int b) { + return __vsubss4(a, b); +} + +static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) { + const uint8x4_t& va = reinterpret_cast(a); + const uint8x4_t& vb = reinterpret_cast(b); + unsigned int c; + uint8x4_t& vc = reinterpret_cast(c); +#pragma unroll + for (int i = 0; i < 4; ++i) { + vc[i] = va[i] == vb[i] ? 0xff : 0x00; + } + return c; +} + +static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) { + const uint8x4_t& va = reinterpret_cast(a); + const uint8x4_t& vb = reinterpret_cast(b); + unsigned int c; + uint8x4_t& vc = reinterpret_cast(c); +#pragma unroll + for (int i = 0; i < 4; ++i) { + vc[i] = va[i] == vb[i] ? 0x00 : 0xff; + } + return c; +} + +#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000 +// __shfl_xor() for half2 was added in ROCm 5.6 +static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) { + typedef union half2_b32 { + half2 val; + int b32; + } half2_b32_t; + half2_b32_t tmp; + tmp.val = var; + tmp.b32 = __shfl_xor(tmp.b32, laneMask, width); + return tmp.val; +} +#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000 diff --git a/ggml/src/ggml-cuda/vendors/musa.h b/ggml/src/ggml-cuda/vendors/musa.h new file mode 100644 index 000000000..e50a103ac --- /dev/null +++ b/ggml/src/ggml-cuda/vendors/musa.h @@ -0,0 +1,171 @@ +#pragma once + +#include +#include +#include +#include +#define CUBLAS_COMPUTE_16F CUDA_R_16F +#define CUBLAS_COMPUTE_32F CUDA_R_32F +#define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F +#define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT +#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT +#define CUBLAS_OP_N MUBLAS_OP_N +#define CUBLAS_OP_T MUBLAS_OP_T +#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS +#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT +#define CUDA_R_16F MUSA_R_16F +#define CUDA_R_32F MUSA_R_32F +#define cublasComputeType_t cudaDataType_t +#define cublasCreate mublasCreate +#define cublasDestroy mublasDestroy +#define cublasGemmEx mublasGemmEx +#define cublasGemmBatchedEx mublasGemmBatchedEx +#define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx +#define cublasHandle_t mublasHandle_t +#define cublasSetMathMode mublasSetMathMode +#define cublasSetStream mublasSetStream +#define cublasSgemm mublasSgemm +#define cublasStatus_t mublasStatus_t +#define cublasGetStatusString mublasStatus_to_string +#define cudaDataType_t musaDataType_t +#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer +#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess +#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess +#define cudaDeviceProp musaDeviceProp +#define cudaDeviceSynchronize musaDeviceSynchronize +#define cudaError_t musaError_t +#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled +#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled +#define cudaEventCreateWithFlags musaEventCreateWithFlags +#define cudaEventDisableTiming musaEventDisableTiming +#define cudaEventRecord musaEventRecord +#define cudaEventSynchronize musaEventSynchronize +#define cudaEvent_t musaEvent_t +#define cudaEventDestroy musaEventDestroy +#define cudaFree musaFree +#define cudaFreeHost musaFreeHost +#define cudaGetDevice musaGetDevice +#define cudaGetDeviceCount musaGetDeviceCount +#define cudaGetDeviceProperties musaGetDeviceProperties +#define cudaGetErrorString musaGetErrorString +#define cudaGetLastError musaGetLastError +#define cudaHostRegister musaHostRegister +#define cudaHostRegisterPortable musaHostRegisterPortable +#define cudaHostRegisterReadOnly musaHostRegisterReadOnly +#define cudaHostUnregister musaHostUnregister +#define cudaLaunchHostFunc musaLaunchHostFunc +#define cudaMalloc musaMalloc +#define cudaMallocHost musaMallocHost +#define cudaMemcpy musaMemcpy +#define cudaMemcpyAsync musaMemcpyAsync +#define cudaMemcpyPeerAsync musaMemcpyPeerAsync +#define cudaMemcpy2DAsync musaMemcpy2DAsync +#define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice +#define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost +#define cudaMemcpyHostToDevice musaMemcpyHostToDevice +#define cudaMemcpyKind musaMemcpyKind +#define cudaMemset musaMemset +#define cudaMemsetAsync musaMemsetAsync +#define cudaMemGetInfo musaMemGetInfo +#define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize +#define cudaSetDevice musaSetDevice +#define cudaStreamCreateWithFlags musaStreamCreateWithFlags +#define cudaStreamDestroy musaStreamDestroy +#define cudaStreamFireAndForget musaStreamFireAndForget +#define cudaStreamNonBlocking musaStreamNonBlocking +#define cudaStreamPerThread musaStreamPerThread +#define cudaStreamSynchronize musaStreamSynchronize +#define cudaStreamWaitEvent musaStreamWaitEvent +#define cudaStream_t musaStream_t +#define cudaSuccess musaSuccess + +// Additional mappings for MUSA virtual memory pool +#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED +#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE +#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED +#define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED +#define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE +#define CUdevice MUdevice +#define CUdeviceptr MUdeviceptr +#define CUmemAccessDesc MUmemAccessDesc +#define CUmemAllocationProp MUmemAllocationProp +#define CUmemGenericAllocationHandle MUmemGenericAllocationHandle +#define cuDeviceGet muDeviceGet +#define cuDeviceGetAttribute muDeviceGetAttribute +#define cuMemAddressFree muMemAddressFree +#define cuMemAddressReserve muMemAddressReserve +#define cuMemCreate muMemCreate +#define cuMemGetAllocationGranularity muMemGetAllocationGranularity +#define cuMemMap muMemMap +#define cuMemRelease muMemRelease +#define cuMemSetAccess muMemSetAccess +#define cuMemUnmap muMemUnmap +#define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize +#define cudaFuncSetAttribute musaFuncSetAttribute +#define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms +#define make_cudaExtent make_musaExtent +#define make_cudaPitchedPtr make_musaPitchedPtr + +// Additional mappings for MUSA graphs +#define CUDA_SUCCESS MUSA_SUCCESS +#define CUresult MUresult +#define cuGetErrorString muGetErrorString +#define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure +#define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction +#define cudaGraphDestroy musaGraphDestroy +#define cudaGraphExecDestroy musaGraphExecDestroy +#define cudaGraphExec_t musaGraphExec_t +#define cudaGraphExecUpdate musaGraphExecUpdate +#define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult +#define cudaGraphGetNodes musaGraphGetNodes +#define cudaGraphInstantiate musaGraphInstantiate +#define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams +#define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams +#define cudaGraphLaunch musaGraphLaunch +#define cudaGraphNodeGetType musaGraphNodeGetType +#define cudaGraphNode_t musaGraphNode_t +#define cudaGraphNodeType musaGraphNodeType +#define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel +#define cudaGraph_t musaGraph_t +#define cudaKernelNodeParams musaKernelNodeParams +#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed +#define cudaStreamEndCapture musaStreamEndCapture + +// XXX: Clang builtins mapping +#define __vsub4 __vsub4_musa +#define __vcmpeq4 __vcmpeq4_musa +#define __vcmpne4 __vcmpne4_musa + +#ifndef __has_builtin + #define __has_builtin(x) 0 +#endif + +typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4))); + +static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) { + return __vsubss4(a, b); +} + +static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) { + const uint8x4_t& va = reinterpret_cast(a); + const uint8x4_t& vb = reinterpret_cast(b); + unsigned int c; + uint8x4_t& vc = reinterpret_cast(c); +#pragma unroll + for (int i = 0; i < 4; ++i) { + vc[i] = va[i] == vb[i] ? 0xff : 0x00; + } + return c; +} + +static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) { + const uint8x4_t& va = reinterpret_cast(a); + const uint8x4_t& vb = reinterpret_cast(b); + unsigned int c; + uint8x4_t& vc = reinterpret_cast(c); +#pragma unroll + for (int i = 0; i < 4; ++i) { + vc[i] = va[i] == vb[i] ? 0x00 : 0xff; + } + return c; +} From 75af08c475e285888f66556d0f459c533b7deb95 Mon Sep 17 00:00:00 2001 From: CarterLi999 <664681047@qq.com> Date: Tue, 30 Jul 2024 00:38:34 +0800 Subject: [PATCH 05/28] ggml: bugfix: fix the inactive elements is agnostic for risc-v vector (#8748) In these codes, we want to retain the value that they previously held when mask[i] is false. So we should use undisturbed. With the default agnostic policy of rvv intrinsic, these values can be held or be written with 1s. Co-authored-by: carter.li --- ggml/src/ggml-quants.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 9016314f5..16aaf523f 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -6449,22 +6449,22 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r // compute mask for subtraction vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl); vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl); - vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_m(vmask_0, q3_0, 0x4, vl); + vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_mu(vmask_0, q3_0, q3_0, 0x4, vl); m <<= 1; vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl); vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl); - vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_m(vmask_1, q3_1, 0x4, vl); + vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_mu(vmask_1, q3_1, q3_1, 0x4, vl); m <<= 1; vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl); vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl); - vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_m(vmask_2, q3_2, 0x4, vl); + vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_mu(vmask_2, q3_2, q3_2, 0x4, vl); m <<= 1; vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl); vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl); - vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_m(vmask_3, q3_3, 0x4, vl); + vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_mu(vmask_3, q3_3, q3_3, 0x4, vl); m <<= 1; // load Q8 and take product with Q3 @@ -7720,13 +7720,13 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl)); vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl); vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl); - vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_m(vmask_1, q5_a, 16, vl); + vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_mu(vmask_1, q5_a, q5_a, 16, vl); m <<= 1; vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl)); vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl); vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl); - vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_m(vmask_2, q5_l, 16, vl); + vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_mu(vmask_2, q5_l, q5_l, 16, vl); m <<= 1; vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl); From c887d8b01726b11ea03dbcaa9d44fa74422d0076 Mon Sep 17 00:00:00 2001 From: zhentaoyu Date: Tue, 30 Jul 2024 14:56:51 +0800 Subject: [PATCH 06/28] [SYCL] Add `TIMESTEP_EMBEDDING` OP (#8707) Signed-off-by: zhentaoyu --- ggml/src/ggml-sycl.cpp | 4 ++ ggml/src/ggml-sycl/backend.hpp | 1 + ggml/src/ggml-sycl/presets.hpp | 1 + ggml/src/ggml-sycl/tsembd.cpp | 71 ++++++++++++++++++++++++++++++++++ ggml/src/ggml-sycl/tsembd.hpp | 21 ++++++++++ 5 files changed, 98 insertions(+) create mode 100644 ggml/src/ggml-sycl/tsembd.cpp create mode 100644 ggml/src/ggml-sycl/tsembd.hpp diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp index d1dd07f64..d8eb86c2c 100644 --- a/ggml/src/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl.cpp @@ -4108,6 +4108,9 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens case GGML_OP_ARGSORT: func = ggml_sycl_argsort; break; + case GGML_OP_TIMESTEP_EMBEDDING: + func = ggml_sycl_op_timestep_embedding; + break; default: return false; } @@ -5225,6 +5228,7 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons case GGML_OP_UPSCALE: case GGML_OP_PAD: case GGML_OP_LEAKY_RELU: + case GGML_OP_TIMESTEP_EMBEDDING: return true; default: return false; diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp index 98b0ebc19..58dd9c9a6 100644 --- a/ggml/src/ggml-sycl/backend.hpp +++ b/ggml/src/ggml-sycl/backend.hpp @@ -24,5 +24,6 @@ #include "rope.hpp" #include "norm.hpp" #include "softmax.hpp" +#include "tsembd.hpp" #endif // GGML_SYCL_BACKEND_HPP diff --git a/ggml/src/ggml-sycl/presets.hpp b/ggml/src/ggml-sycl/presets.hpp index 479789626..340ab8e93 100644 --- a/ggml/src/ggml-sycl/presets.hpp +++ b/ggml/src/ggml-sycl/presets.hpp @@ -42,6 +42,7 @@ #define SYCL_IM2COL_BLOCK_SIZE 256 #define SYCL_POOL2D_BLOCK_SIZE 256 #define SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE 256 +#define SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE 256 // dmmv = dequantize_mul_mat_vec #ifndef GGML_SYCL_DMMV_X diff --git a/ggml/src/ggml-sycl/tsembd.cpp b/ggml/src/ggml-sycl/tsembd.cpp new file mode 100644 index 000000000..d5c227cd1 --- /dev/null +++ b/ggml/src/ggml-sycl/tsembd.cpp @@ -0,0 +1,71 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#include "tsembd.hpp" + +static void timestep_embedding_f32( + const float * timesteps, float * dst, const int nb1, + const int dim, const int max_period, const sycl::nd_item<3> &item_ct1) { + // item_ct1.get_group(1)(blockIDx.y): idx of timesteps->ne[0] + // item_ct1.get_group(2) (blockIDx.x): idx of ((dim + 1) / 2) / BLOCK_SIZE + int i = item_ct1.get_group(1); + int j = item_ct1.get_local_id(2) + item_ct1.get_group(2) * item_ct1.get_local_range(2); + float * embed_data = (float *)((char *)dst + i*nb1); + + if (dim % 2 != 0 && j == ((dim + 1) / 2)) { + embed_data[dim] = 0.f; + } + + int half = dim / 2; + if (j >= half) { + return; + } + + float timestep = timesteps[i]; + float freq = (float)sycl::native::exp(-(sycl::log((float)max_period)) * j / half); + float arg = timestep * freq; + embed_data[j] = sycl::cos(arg); + embed_data[j + half] = sycl::sin(arg); +} + +static void timestep_embedding_f32_sycl( + const float * x, float * dst, const int ne00, const int nb1, + const int dim, const int max_period, const queue_ptr& stream) { + // As the kernel returns when thread.idx is larger than dim/2, the half_ceil does not need to pad + int half_ceil = dim / 2; + int num_blocks = (half_ceil + SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE; + sycl::range<3> block_dims(1, 1, SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE); + sycl::range<3> gridDim(1, ne00, num_blocks); + stream->parallel_for( + sycl::nd_range<3>( + gridDim * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + timestep_embedding_f32( + x, dst, nb1, dim, max_period, item_ct1 + ); + }); +} + +void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor * dst) { + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + dpct::queue_ptr stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + const int dim = dst->op_params[0]; + const int max_period = dst->op_params[1]; + + timestep_embedding_f32_sycl(src0_d, dst_d, src0->ne[0], dst->nb[1], dim, max_period, stream); +} diff --git a/ggml/src/ggml-sycl/tsembd.hpp b/ggml/src/ggml-sycl/tsembd.hpp new file mode 100644 index 000000000..ff854c337 --- /dev/null +++ b/ggml/src/ggml-sycl/tsembd.hpp @@ -0,0 +1,21 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_TSEMBD_HPP +#define GGML_SYCL_TSEMBD_HPP + +#include "common.hpp" + +void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor * dst); + +#endif // GGML_SYCL_TSEMBD_HPP From 6e2b6000e5fe808954a7dcef8225b5b7f2c1b9e9 Mon Sep 17 00:00:00 2001 From: wangshuai09 <391746016@qq.com> Date: Tue, 30 Jul 2024 18:37:35 +0800 Subject: [PATCH 07/28] cann: update cmake (#8765) --- ggml/CMakeLists.txt | 1 + ggml/src/CMakeLists.txt | 6 +----- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index a5c2e96a8..7fe1661bb 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -207,6 +207,7 @@ set(GGML_PUBLIC_HEADERS include/ggml-alloc.h include/ggml-backend.h include/ggml-blas.h + include/ggml-cann.h include/ggml-cuda.h include/ggml.h include/ggml-kompute.h diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 836496fb9..425a25895 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -849,11 +849,6 @@ if (GGML_CANN) ${CANN_INSTALL_DIR}/acllib/include ) - # TODO: find libs - link_directories( - ${CANN_INSTALL_DIR}/lib64 - ) - add_subdirectory(ggml-cann/kernels) list(APPEND CANN_LIBRARIES ascendcl @@ -872,6 +867,7 @@ if (GGML_CANN) set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${CANN_LIBRARIES} ) set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CANN_INCLUDE_DIRS}) + set(GGML_EXTRA_LIBDIRS ${GGML_EXTRA_LIBDIRS} ${CANN_INSTALL_DIR}/lib64) list(APPEND GGML_CDEF_PUBLIC GGML_USE_CANN) endif() else() From 140074bb8647df41840d6f32f4409fa8959bcf9f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 30 Jul 2024 15:58:57 +0300 Subject: [PATCH 08/28] flake.lock: Update (#8729) --- flake.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flake.lock b/flake.lock index 940cda6a4..3dc68abb6 100644 --- a/flake.lock +++ b/flake.lock @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1721379653, - "narHash": "sha256-8MUgifkJ7lkZs3u99UDZMB4kbOxvMEXQZ31FO3SopZ0=", + "lastModified": 1722062969, + "narHash": "sha256-QOS0ykELUmPbrrUGmegAUlpmUFznDQeR4q7rFhl8eQg=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "1d9c2c9b3e71b9ee663d11c5d298727dace8d374", + "rev": "b73c2221a46c13557b1b3be9c2070cc42cf01eb3", "type": "github" }, "original": { From 7c27a19b2eb91bb0f43c7f7aec0386cec2dddc33 Mon Sep 17 00:00:00 2001 From: l3utterfly Date: Tue, 30 Jul 2024 23:40:18 +0900 Subject: [PATCH 09/28] added android implementation of ggml_print_backtrace_symbols (#8751) * added android implementation of ggml_print_backtrace_symbols * Update ggml/src/ggml.c Co-authored-by: slaren * Update ggml/src/ggml.c Co-authored-by: slaren * Update ggml/src/ggml.c Co-authored-by: slaren * Update ggml/src/ggml.c Co-authored-by: slaren * Update ggml/src/ggml.c Co-authored-by: slaren --------- Co-authored-by: slaren --- ggml/src/ggml.c | 46 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index c76d00a39..4d5667884 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -141,7 +141,51 @@ typedef pthread_t ggml_thread_t; #include -#if defined(__linux__) +#if defined(__ANDROID__) +#include +#include +#include + +struct backtrace_state { + void ** current; + void ** end; +}; + +static _Unwind_Reason_Code unwind_callback(struct _Unwind_Context* context, void* arg) { + struct backtrace_state * state = (struct backtrace_state *)arg; + uintptr_t pc = _Unwind_GetIP(context); + if (pc) { + if (state->current == state->end) { + return _URC_END_OF_STACK; + } else { + *state->current++ = (void*)pc; + } + } + return _URC_NO_REASON; +} + +static void ggml_print_backtrace_symbols(void) { + const int max = 100; + void* buffer[max]; + + struct backtrace_state state = {buffer, buffer + max}; + _Unwind_Backtrace(unwind_callback, &state); + + int count = state.current - buffer; + + for (int idx = 0; idx < count; ++idx) { + const void * addr = buffer[idx]; + const char * symbol = ""; + + Dl_info info; + if (dladdr(addr, &info) && info.dli_sname) { + symbol = info.dli_sname; + } + + fprintf(stderr, "%d: %p %s\n", idx, addr, symbol); + } +} +#elif defined(__linux__) #include static void ggml_print_backtrace_symbols(void) { void * trace[100]; From 7e72aa74fd676a093eb9970e761085ec22734c71 Mon Sep 17 00:00:00 2001 From: Brian Date: Wed, 31 Jul 2024 00:57:03 +1000 Subject: [PATCH 10/28] py: add_array() will not add to kv store if value is an empty array (#8774) * gguf_writer.py: add_array() should not add to kv store if empty * Apply suggestions from code review I was wondering if there was a specific reason for `if val` but good to hear we can safely use `len(val == 0` Co-authored-by: compilade --------- Co-authored-by: compilade --- gguf-py/gguf/gguf_writer.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index ba6f53cda..2e0b335ee 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -312,6 +312,8 @@ class GGUFWriter: self.add_key_value(key, val, GGUFValueType.STRING) def add_array(self, key: str, val: Sequence[Any]) -> None: + if len(val) == 0: + return self.add_key_value(key, val, GGUFValueType.ARRAY) @staticmethod @@ -845,7 +847,14 @@ class GGUFWriter: encoded_val = val.encode("utf-8") if isinstance(val, str) else val kv_data += self._pack("Q", len(encoded_val)) kv_data += encoded_val - elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and val: + elif vtype == GGUFValueType.ARRAY: + + if not isinstance(val, Sequence): + raise ValueError("Invalid GGUF metadata array, expecting sequence") + + if len(val) == 0: + raise ValueError("Invalid GGUF metadata array. Empty array") + if isinstance(val, bytes): ltype = GGUFValueType.UINT8 else: From 268c5660062270a2c19a36fc655168aa287aaec2 Mon Sep 17 00:00:00 2001 From: Someone Date: Tue, 30 Jul 2024 23:35:30 +0300 Subject: [PATCH 11/28] nix: cuda: rely on propagatedBuildInputs (#8772) Listing individual outputs no longer necessary to reduce the runtime closure size after https://github.com/NixOS/nixpkgs/pull/323056. --- .devops/nix/package.nix | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix index 911c42ecb..a87423c71 100644 --- a/.devops/nix/package.nix +++ b/.devops/nix/package.nix @@ -126,16 +126,9 @@ let ++ optionals useMetalKit [ MetalKit ]; cudaBuildInputs = with cudaPackages; [ - cuda_cccl.dev # - - # A temporary hack for reducing the closure size, remove once cudaPackages - # have stopped using lndir: https://github.com/NixOS/nixpkgs/issues/271792 - cuda_cudart.dev - cuda_cudart.lib - cuda_cudart.static - libcublas.dev - libcublas.lib - libcublas.static + cuda_cudart + cuda_cccl # + libcublas ]; rocmBuildInputs = with rocmPackages; [ From 44d28ddd5caaa5e9de573bdaaa5b5b2448a29ace Mon Sep 17 00:00:00 2001 From: Borislav Stanimirov Date: Wed, 31 Jul 2024 16:40:08 +0300 Subject: [PATCH 12/28] cmake : fix use of external ggml (#8787) --- CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 793709122..a31320635 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -139,7 +139,8 @@ set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location o # determining _precisely_ which defines are necessary for the llama-config # package. # -get_directory_property(GGML_DIR_DEFINES DIRECTORY ggml/src COMPILE_DEFINITIONS) +get_target_property(GGML_DIRECTORY ggml SOURCE_DIR) +get_directory_property(GGML_DIR_DEFINES DIRECTORY ${GGML_DIRECTORY} COMPILE_DEFINITIONS) get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS) set(GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES} ${GGML_DIR_DEFINES}) get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES) From 398ede5efeb07b9adf9fbda7ea63f630d476a792 Mon Sep 17 00:00:00 2001 From: pculliton Date: Wed, 31 Jul 2024 11:12:10 -0400 Subject: [PATCH 13/28] Adding Gemma 2 2B configs (#8784) * Adding Gemma 2 2B configs Updates to Q scaling and Gemma 2 model sizes to match v2 2B model. * Update src/llama.cpp Co-authored-by: slaren --------- Co-authored-by: slaren --- src/llama.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/llama.cpp b/src/llama.cpp index a207451f5..e6f303d31 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -4969,6 +4969,7 @@ static void llm_load_hparams( hparams.attn_soft_cap = true; switch (hparams.n_layer) { + case 26: model.type = e_model::MODEL_2B; break; case 42: model.type = e_model::MODEL_9B; break; case 46: model.type = e_model::MODEL_27B; break; default: model.type = e_model::MODEL_UNKNOWN; @@ -11736,6 +11737,7 @@ struct llm_build_context { // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e switch (model.type) { + case e_model::MODEL_2B: case e_model::MODEL_9B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break; case e_model::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break; default: GGML_ABORT("fatal error"); From ed9d2854c9de4ae1f448334294e61167b04bec2a Mon Sep 17 00:00:00 2001 From: Clint Herron Date: Wed, 31 Jul 2024 15:51:06 -0400 Subject: [PATCH 14/28] Build: Fix potential race condition (#8781) * Fix potential race condition as pointed out by @fairydreaming in #8776 * Reference the .o rather than rebuilding every time. * Adding in CXXFLAGS and LDFLAGS * Removing unnecessary linker flags. --- Makefile | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/Makefile b/Makefile index c82f4268a..f4ce4f1fb 100644 --- a/Makefile +++ b/Makefile @@ -1605,42 +1605,41 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \ # Mark legacy binary targets as .PHONY so that they are always checked. .PHONY: main quantize perplexity embedding server +# Define the object file target +examples/deprecation-warning/deprecation-warning.o: examples/deprecation-warning/deprecation-warning.cpp + $(CXX) $(CXXFLAGS) -c $< -o $@ + # NOTE: We currently will always build the deprecation-warning `main` and `server` binaries to help users migrate. # Eventually we will want to remove these target from building all the time. -main: examples/deprecation-warning/deprecation-warning.cpp - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) +main: examples/deprecation-warning/deprecation-warning.o + $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) @echo "NOTICE: The 'main' binary is deprecated. Please use 'llama-cli' instead." -server: examples/deprecation-warning/deprecation-warning.cpp - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) +server: examples/deprecation-warning/deprecation-warning.o + $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) @echo "NOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead." -quantize: examples/deprecation-warning/deprecation-warning.cpp +quantize: examples/deprecation-warning/deprecation-warning.o ifneq (,$(wildcard quantize)) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) @echo "#########" @echo "WARNING: The 'quantize' binary is deprecated. Please use 'llama-quantize' instead." @echo " Remove the 'quantize' binary to remove this warning." @echo "#########" endif -perplexity: examples/deprecation-warning/deprecation-warning.cpp +perplexity: examples/deprecation-warning/deprecation-warning.o ifneq (,$(wildcard perplexity)) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) @echo "#########" @echo "WARNING: The 'perplexity' binary is deprecated. Please use 'llama-perplexity' instead." @echo " Remove the 'perplexity' binary to remove this warning." @echo "#########" endif -embedding: examples/deprecation-warning/deprecation-warning.cpp +embedding: examples/deprecation-warning/deprecation-warning.o ifneq (,$(wildcard embedding)) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) @echo "#########" @echo "WARNING: The 'embedding' binary is deprecated. Please use 'llama-embedding' instead." @echo " Remove the 'embedding' binary to remove this warning." From afbbcf3c04e3c6420cad3d72571478cd62ac176c Mon Sep 17 00:00:00 2001 From: Igor Okulist Date: Wed, 31 Jul 2024 18:59:09 -0500 Subject: [PATCH 15/28] server : update llama-server embedding flag documentation (#8779) Fixes #8763 --- common/common.cpp | 2 +- examples/server/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 60c7eac75..521f849e2 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1634,7 +1634,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() }); options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port }); options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() }); - options.push_back({ "server", " --embedding(s)", "enable embedding endpoint (default: %s)", params.embedding ? "enabled" : "disabled" }); + options.push_back({ "server", " --embedding(s)", "restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled" }); options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" }); options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" }); options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" }); diff --git a/examples/server/README.md b/examples/server/README.md index 33a2b95cc..de83ee7d0 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -247,7 +247,7 @@ server: --host HOST ip address to listen (default: 127.0.0.1) --port PORT port to listen (default: 8080) --path PATH path to serve static files from (default: ) - --embedding(s) enable embedding endpoint (default: disabled) + --embedding(s) restrict to only support embedding use case; use only with dedicated embedding models (default: disabled) --api-key KEY API key to use for authentication (default: none) --api-key-file FNAME path to file containing API keys (default: none) --ssl-key-file FNAME path to file a PEM-encoded SSL private key From c8a0090922bad576623de4aae227717085249262 Mon Sep 17 00:00:00 2001 From: wangshuai09 <391746016@qq.com> Date: Thu, 1 Aug 2024 10:39:05 +0800 Subject: [PATCH 16/28] cann: support q8_0 for Ascend backend (#8805) --- ggml/src/ggml-cann/aclnn_ops.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index f27666970..90ccf3e18 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -2381,10 +2381,10 @@ static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx, size_t input_nb[] = {input_elem_size, input_elem_size * src1->ne[0]}; size_t input_stride = input_elem_size * src1->ne[0] * src1->ne[1]; + ggml_cann_pool_alloc input_alloctor(ctx.pool()); if (src1->type != GGML_TYPE_F16) { aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1); - ggml_cann_pool_alloc input_alloctor( - ctx.pool(), ggml_nelements(src1) * input_elem_size); + input_alloctor.alloc(ggml_nelements(src1) * input_elem_size); input_buffer = input_alloctor.get(); int64_t* input_cast_ne = src1->ne; From 7a11eb3a260915aee16101808f291a244e2facc7 Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 1 Aug 2024 15:26:22 +0200 Subject: [PATCH 17/28] cuda : fix dmmv cols requirement to 2*GGML_CUDA_DMMV_X (#8800) * cuda : fix dmmv cols requirement to 2*GGML_CUDA_DMMV_X * update asserts * only use dmmv for supported types * add test --- ggml/src/ggml-cuda.cu | 5 ++--- ggml/src/ggml-cuda/dmmv.cu | 21 +++++++++++++++------ ggml/src/ggml-cuda/dmmv.cuh | 2 ++ tests/test-backend-ops.cpp | 5 +++-- 4 files changed, 22 insertions(+), 11 deletions(-) diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index c73ae40d4..b510777fb 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -1885,10 +1885,9 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer); - bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) + bool use_dequantize_mul_mat_vec = ggml_cuda_dmmv_type_supported(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 - && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[0] >= GGML_CUDA_DMMV_X*2 - && src1->ne[1] == 1; + && src0->ne[0] % (GGML_CUDA_DMMV_X*2) == 0 && src1->ne[1] == 1; bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE; diff --git a/ggml/src/ggml-cuda/dmmv.cu b/ggml/src/ggml-cuda/dmmv.cu index d7a2a2513..96a5adef5 100644 --- a/ggml/src/ggml-cuda/dmmv.cu +++ b/ggml/src/ggml-cuda/dmmv.cu @@ -500,7 +500,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons } static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead const dim3 block_nums(block_num_y, 1, 1); @@ -510,7 +510,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, } static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); @@ -519,7 +519,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, } static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); @@ -528,7 +528,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, } static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); @@ -537,7 +537,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, } static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); @@ -588,7 +588,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f } static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); @@ -672,3 +672,12 @@ void ggml_cuda_op_dequantize_mul_mat_vec( GGML_UNUSED(src1_ncols); GGML_UNUSED(src1_padded_row_size); } + +bool ggml_cuda_dmmv_type_supported(ggml_type src0_type) { + return src0_type == GGML_TYPE_Q4_0 || src0_type == GGML_TYPE_Q4_1 || + src0_type == GGML_TYPE_Q5_0 || src0_type == GGML_TYPE_Q5_1 || + src0_type == GGML_TYPE_Q8_0 || src0_type == GGML_TYPE_Q2_K || + src0_type == GGML_TYPE_Q3_K || src0_type == GGML_TYPE_Q4_K || + src0_type == GGML_TYPE_Q5_K || src0_type == GGML_TYPE_Q6_K || + src0_type == GGML_TYPE_F16; +} diff --git a/ggml/src/ggml-cuda/dmmv.cuh b/ggml/src/ggml-cuda/dmmv.cuh index 4c5ebd475..e727eb97f 100644 --- a/ggml/src/ggml-cuda/dmmv.cuh +++ b/ggml/src/ggml-cuda/dmmv.cuh @@ -16,3 +16,5 @@ void ggml_cuda_op_dequantize_mul_mat_vec( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, const int64_t src1_padded_row_size, cudaStream_t stream); + +bool ggml_cuda_dmmv_type_supported(ggml_type src0_type); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 2fa59fd0a..5de70d554 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -804,8 +804,7 @@ struct test_cpy : public test_case { test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32, std::array ne = {10, 10, 10, 1}, - std::array permute = {0, 0, 0, 0}, - bool _dst_use_permute = false) + std::array permute = {0, 0, 0, 0}) : type_src(type_src), type_dst(type_dst), ne(ne), permute(permute), _src_use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {} @@ -2269,6 +2268,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op for (ggml_type type_a : other_types) { for (ggml_type type_b : {GGML_TYPE_F32}) { + + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, ggml_blck_size(type_a), { 1, 1}, {1, 1})); test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1})); } } From b7a08fd5e0e7c898c68d1743066ea495202d9608 Mon Sep 17 00:00:00 2001 From: Alex O'Connell <35843486+acon96@users.noreply.github.com> Date: Thu, 1 Aug 2024 12:53:46 -0400 Subject: [PATCH 18/28] Build: Only include execinfo.h on linux systems that support it (#8783) * Only enable backtrace on GLIBC linux systems * fix missing file from copy * use glibc macro instead of defining a custom one --- ggml/src/ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 4d5667884..a4e89cf32 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -185,7 +185,7 @@ static void ggml_print_backtrace_symbols(void) { fprintf(stderr, "%d: %p %s\n", idx, addr, symbol); } } -#elif defined(__linux__) +#elif defined(__linux__) && defined(__GLIBC__) #include static void ggml_print_backtrace_symbols(void) { void * trace[100]; From afbb4c1322a747d2a7b4bf67c868148f8afcc6c8 Mon Sep 17 00:00:00 2001 From: matteo Date: Thu, 1 Aug 2024 23:28:28 +0200 Subject: [PATCH 19/28] ggml-cuda: Adding support for unified memory (#8035) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Adding support for unified memory * adding again the documentation about unified memory * refactoring: Moved the unified memory code in the correct location. * Fixed compilation error when using hipblas * cleaning up the documentation * Updating the documentation Co-authored-by: Johannes Gäßler * adding one more case where the PR should not be enabled --------- Co-authored-by: matteo serva Co-authored-by: Johannes Gäßler --- docs/build.md | 6 +++++- ggml/src/ggml-cuda.cu | 15 +++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/docs/build.md b/docs/build.md index cfe42ebbf..8b16d1a35 100644 --- a/docs/build.md +++ b/docs/build.md @@ -178,7 +178,11 @@ For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](ht cmake --build build --config Release ``` -The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance: +The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. + +The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. In Windows this setting is available in the NVIDIA control panel as `System Memory Fallback`. + +The following compilation options are also available to tweak performance: | Option | Legal values | Default | Description | |-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index b510777fb..68605fff6 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -130,7 +130,22 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) } return res; #else + +#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) + cudaError_t err; + if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr) + { + err = cudaMallocManaged(ptr, size); + } + else + { + err = cudaMalloc(ptr, size); + } + return err; +#else return cudaMalloc(ptr, size); +#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) + #endif } From 0fbbd884589d585c3b43cae8c16938ffffb863b9 Mon Sep 17 00:00:00 2001 From: Ouadie EL FAROUKI Date: Fri, 2 Aug 2024 01:55:17 +0100 Subject: [PATCH 20/28] [SYCL] Fixing wrong VDR iq4nl value (#8812) --- ggml/src/ggml-sycl/mmvq.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp index 23232357e..1b96925e1 100644 --- a/ggml/src/ggml-sycl/mmvq.cpp +++ b/ggml/src/ggml-sycl/mmvq.cpp @@ -902,7 +902,7 @@ static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q_iq4_nl_q8_1( + mul_mat_vec_q_iq4_nl_q8_1( vx, vy, dst, ncols, nrows, item_ct1); }); }); From e09a800f9a9b19c73aa78e03b4c4be8ed988f3e6 Mon Sep 17 00:00:00 2001 From: Mengqing Cao Date: Fri, 2 Aug 2024 16:50:53 +0800 Subject: [PATCH 21/28] cann: Fix ggml_cann_im2col for 1D im2col (#8819) * fix ggml_cann_im2col for 1D im2col * fix build warning --- ggml/src/ggml-cann/aclnn_ops.cpp | 167 +++++++++++++++++++++++++------ tests/test-backend-ops.cpp | 3 + 2 files changed, 142 insertions(+), 28 deletions(-) diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 90ccf3e18..556284888 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -1312,6 +1312,111 @@ aclnnStatus aclnnIm2col(void* workspace, uint64_t workspaceSize, #ifdef __cplusplus } #endif + +static void ggml_cann_im2col_2d_post_process(ggml_backend_cann_context& ctx, + ggml_tensor* dst, + ggml_tensor* src1, + aclTensor* tmp_cast_tensor, + aclTensor* tmp_im2col_tensor) { + // Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW] + int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]}; + size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]}; + aclTensor* acl_dst = + ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1); + + int64_t permute_dim[] = {0, 2, 1}; + if (src1->type != dst->type) { + aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3); + } else { + aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3); + } + + // release + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +static void ggml_cann_im2col_1d_post_process( + ggml_backend_cann_context& ctx, ggml_tensor* dst, ggml_tensor* src1, + aclTensor* tmp_cast_tensor, aclTensor* tmp_im2col_tensor, + const std::vector& im2col_op_params) { + // get params + const int64_t KH = im2col_op_params[0]; + const int64_t KW = im2col_op_params[1]; + const int64_t IW = im2col_op_params[2]; + const int64_t IC = im2col_op_params[3]; + const int64_t N = im2col_op_params[4]; + const int64_t OH = im2col_op_params[5]; + const int64_t OW = im2col_op_params[6]; + const int64_t s0 = im2col_op_params[7]; + const int64_t p0 = im2col_op_params[8]; + const int64_t d0 = im2col_op_params[9]; + const int64_t n_bytes_factor = im2col_op_params[10]; + + // Permute: [N, IC * KH * KW, OW * OH] -> + // [N, OW * OH * n_bytes_factor, IC * KH * KW] + aclTensor* tmp_permute_tensor = nullptr; + ggml_cann_pool_alloc tmp_permute_allocator(ctx.pool()); + tmp_permute_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor); + void* tmp_permute_buffer = tmp_permute_allocator.get(); + + int64_t tmp_permute_ne[] = {IC * KH * KW, OW * OH * n_bytes_factor, N}; + size_t tmp_permute_nb[GGML_MAX_DIMS - 1]; + tmp_permute_nb[0] = ggml_type_size(dst->type); + for (int i = 1; i < GGML_MAX_DIMS - 1; i++) { + tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1]; + } + + tmp_permute_tensor = ggml_cann_create_tensor( + tmp_permute_buffer, ggml_cann_type_mapping(dst->type), + ggml_type_size(dst->type), tmp_permute_ne, tmp_permute_nb, + GGML_MAX_DIMS - 1, ACL_FORMAT_ND); + + int64_t permute_dim[] = {0, 2, 1}; + if (src1->type != dst->type) { + aclnn_permute(ctx, tmp_cast_tensor, tmp_permute_tensor, permute_dim, 3); + } else { + aclnn_permute(ctx, tmp_im2col_tensor, tmp_permute_tensor, permute_dim, + 3); + } + + // number of times the kernel moves in W dimension + const int n_step_w = (IW + 2 * p0 - d0 * (KW - 1) - 1) / s0 + 1; + size_t offset; + void *cur_dst_buffer = dst->data, *cur_permute_buffer = tmp_permute_buffer; + + // memory copy with offset to restore 1D im2col from 2d + if (IC > 1) { + offset = IC * KH * KW * n_step_w * ggml_type_size(dst->type); + size_t size_cpy = KH * KW * ggml_type_size(dst->type); + + for (int c = 0; c < IC; c++) { + cur_permute_buffer = (char*)tmp_permute_buffer + offset + + KH * KW * c * ggml_type_size(dst->type); + cur_dst_buffer = (char*)dst->data + + c * KH * KW * n_step_w * ggml_type_size(dst->type); + + for (int i = 0; i < n_step_w; i++) { + ACL_CHECK(aclrtMemcpyAsync( + cur_dst_buffer, size_cpy, cur_permute_buffer, size_cpy, + ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream())); + cur_dst_buffer = + (char*)cur_dst_buffer + KH * KW * ggml_type_size(dst->type); + cur_permute_buffer = (char*)cur_permute_buffer + + KH * KW * IC * ggml_type_size(dst->type); + } + } + } else { + offset = KH * KW * n_step_w * + ggml_type_size(dst->type); // equal to ggml_nbytes(dst) + ACL_CHECK(aclrtMemcpyAsync(dst->data, offset, + (char*)tmp_permute_buffer + offset, offset, + ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream())); + } + + // release + ACL_CHECK(aclDestroyTensor(tmp_permute_tensor)); +} + void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_tensor* src0 = dst->src[0]; // kernel ggml_tensor* src1 = dst->src[1]; // input @@ -1320,21 +1425,23 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32); - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; - const int32_t p0 = ((const int32_t*)(dst->op_params))[2]; - const int32_t p1 = ((const int32_t*)(dst->op_params))[3]; - const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; - const int32_t d1 = ((const int32_t*)(dst->op_params))[5]; - const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1; - GGML_TENSOR_BINARY_OP_LOCALS; - const int64_t N = is_2D ? ne13 : ne12; - const int64_t IC = is_2D ? ne12 : ne11; + // aclnnIm2col only works on 2D. set s1, p1, d1 to 1 to perform 2D + // im2col and do post-processing to restore it to 1D. + const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1; + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t s1 = is_2D ? ((const int32_t*)(dst->op_params))[1] : 1; + const int32_t p0 = ((const int32_t*)(dst->op_params))[2]; + const int32_t p1 = is_2D ? ((const int32_t*)(dst->op_params))[3] : 1; + const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; + const int32_t d1 = is_2D ? ((const int32_t*)(dst->op_params))[5] : 1; - const int64_t KH = is_2D ? ne01 : 1; + const int64_t N = ne13; + const int64_t IC = ne12; + const int64_t KH = ne01; const int64_t KW = ne00; + const int64_t IW = ne10; const int64_t OH = is_2D ? ne2 : 1; const int64_t OW = ne1; @@ -1342,9 +1449,12 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb10 == sizeof(float)); - // im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH] + // memory allocated increased to 3x when is_2D == false + const int64_t n_bytes_factor = is_2D ? 1 : 3; + + // im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH * n_bytes_factor] aclTensor* acl_src1 = ggml_cann_create_tensor(src1); - int64_t tmp_im2col_ne[] = {OW * OH, IC * KH * KW, N}; + int64_t tmp_im2col_ne[] = {OW * OH * n_bytes_factor, IC * KH * KW, N}; size_t tmp_im2col_nb[GGML_MAX_DIMS - 1]; tmp_im2col_nb[0] = ggml_type_size(src1->type); @@ -1356,8 +1466,10 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { // If dst is f16, tmp_buffer is f32, we need alloc src.typesize * // dst.elemcount. ggml_cann_pool_alloc im2col_allocator( - ctx.pool(), ggml_nelements(dst) * ggml_element_size(src1)); + ctx.pool(), + ggml_nelements(dst) * ggml_element_size(src1) * n_bytes_factor); void* tmp_im2col_buffer = im2col_allocator.get(); + aclTensor* tmp_im2col_tensor = ggml_cann_create_tensor( tmp_im2col_buffer, ggml_cann_type_mapping(src1->type), ggml_type_size(src1->type), tmp_im2col_ne, tmp_im2col_nb, @@ -1380,8 +1492,9 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { paddings, strides, tmp_im2col_tensor, &workspaceSize, &executor)); + ggml_cann_pool_alloc workspace_allocator(ctx.pool()); if (workspaceSize > 0) { - ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspace_allocator.alloc(workspaceSize); workspaceAddr = workspace_allocator.get(); } @@ -1391,9 +1504,10 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { // Cast if dst is f16. aclTensor* tmp_cast_tensor = nullptr; ggml_cann_pool_alloc tmp_cast_allocator(ctx.pool()); + void* tmp_cast_buffer = nullptr; if (src1->type != dst->type) { - tmp_cast_allocator.alloc(ggml_nbytes(dst)); - void* tmp_cast_buffer = tmp_cast_allocator.get(); + tmp_cast_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor); + tmp_cast_buffer = tmp_cast_allocator.get(); size_t temp_cast_nb[GGML_MAX_DIMS - 1]; temp_cast_nb[0] = ggml_type_size(dst->type); for (int i = 1; i < GGML_MAX_DIMS - 1; i++) { @@ -1408,24 +1522,21 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_cann_type_mapping(dst->type)); } - // Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW] - int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]}; - size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]}; - aclTensor* acl_dst = - ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1); - - int64_t permute_dim[] = {0, 2, 1}; - if (src1->type != dst->type) { - aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3); + // post-processing + if (is_2D) { + ggml_cann_im2col_2d_post_process(ctx, dst, src1, tmp_cast_tensor, + tmp_im2col_tensor); } else { - aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3); + std::vector im2col_op_params = { + KH, KW, IW, IC, N, OH, OW, s0, p0, d0, n_bytes_factor}; + ggml_cann_im2col_1d_post_process(ctx, dst, src1, tmp_cast_tensor, + tmp_im2col_tensor, im2col_op_params); } // release ACL_CHECK(aclDestroyTensor(acl_src1)); ACL_CHECK(aclDestroyTensor(tmp_im2col_tensor)); ACL_CHECK(aclDestroyTensor(tmp_cast_tensor)); - ACL_CHECK(aclDestroyTensor(acl_dst)); ACL_CHECK(aclDestroyIntArray(kernel_size)); ACL_CHECK(aclDestroyIntArray(dilations)); ACL_CHECK(aclDestroyIntArray(paddings)); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 5de70d554..f5065f145 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -2139,6 +2139,9 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32)); test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16)); + // test cases for 1D im2col + test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false)); + test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false)); test_cases.emplace_back(new test_conv_transpose_1d()); test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 3, 0, 1)); From b72c20b85c1029d135022d39e9a20d4807c11893 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Fri, 2 Aug 2024 21:11:39 +0200 Subject: [PATCH 22/28] Fix conversion of unnormalized BF16->BF16 weights (#7843) * add truncate_bf16 * truncate intermediate fp32 if converting bf16 to bf16 * fix masking in __compute_fp32_to_bf16 * np.int16 no longer used * missing cast and additional numpy 2.x fix * ggml-impl : do not flush bf16 subnormals to zero * ggml : add reference fp32 to bf16 conversion The fast version is no longer equivalent for all platforms because of the handling of subnormal values. * gguf-py : remove flush to zero for bf16 subnormals * gguf-py : remove float32 truncation to bf16 Rounding achieves the same thing in the cases where this was used. * missed prototype update in merge * merge cleanup --------- Co-authored-by: Francis Couture-Harpin --- convert_hf_to_gguf.py | 2 +- ggml/include/ggml.h | 1 + ggml/src/ggml-impl.h | 9 +++------ ggml/src/ggml.c | 11 +++++++++-- gguf-py/gguf/quants.py | 14 ++++++-------- 5 files changed, 20 insertions(+), 17 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 8ba3c5844..8b33c30d9 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -316,7 +316,7 @@ class Model: if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32: if self.ftype == gguf.LlamaFileType.MOSTLY_BF16: data = gguf.quantize_bf16(data) - assert data.dtype == np.int16 + assert data.dtype == np.uint16 data_qtype = gguf.GGMLQuantizationType.BF16 elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data): diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 464d765da..d8d3dceef 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -349,6 +349,7 @@ extern "C" { GGML_API ggml_bf16_t ggml_fp32_to_bf16(float); GGML_API float ggml_bf16_to_fp32(ggml_bf16_t); // consider just doing << 16 GGML_API void ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t); + GGML_API void ggml_fp32_to_bf16_row_ref(const float *, ggml_bf16_t *, int64_t); GGML_API void ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t); struct ggml_object; diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 7f7afdbfc..3daee4926 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -80,8 +80,9 @@ static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) { /** * Converts float32 to brain16. * - * This function is binary identical to AMD Zen4 VCVTNEPS2BF16. - * Subnormals shall be flushed to zero, and NANs will be quiet. + * This is binary identical with Google Brain float conversion. + * Floats shall round to nearest even, and NANs shall be quiet. + * Subnormals aren't flushed to zero, except perhaps when used. * This code should vectorize nicely if using modern compilers. */ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) { @@ -95,10 +96,6 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) { h.bits = (u.i >> 16) | 64; /* force to quiet */ return h; } - if (!(u.i & 0x7f800000)) { /* subnormal */ - h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */ - return h; - } h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16; return h; } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index a4e89cf32..be672f6ef 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -480,9 +480,16 @@ void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) { } } +void ggml_fp32_to_bf16_row_ref(const float * x, ggml_bf16_t * y, int64_t n) { + for (int i = 0; i < n; i++) { + y[i] = ggml_compute_fp32_to_bf16(x[i]); + } +} + void ggml_fp32_to_bf16_row(const float * x, ggml_bf16_t * y, int64_t n) { int i = 0; #if defined(__AVX512BF16__) + // subnormals are flushed to zero on this platform for (; i + 32 <= n; i += 32) { _mm512_storeu_si512( (__m512i *)(y + i), @@ -962,7 +969,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .is_quantized = false, .to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row, .from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row, - .from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row, + .from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref, .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16, .vec_dot_type = GGML_TYPE_BF16, .nrows = 1, @@ -20650,7 +20657,7 @@ size_t ggml_quantize_chunk( case GGML_TYPE_BF16: { size_t elemsize = sizeof(ggml_bf16_t); - ggml_fp32_to_bf16_row(src + start, (ggml_bf16_t *)dst + start, n); + ggml_fp32_to_bf16_row_ref(src + start, (ggml_bf16_t *)dst + start, n); result = n * elemsize; } break; case GGML_TYPE_F32: diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py index 16e0a9aaa..f4361d751 100644 --- a/gguf-py/gguf/quants.py +++ b/gguf-py/gguf/quants.py @@ -25,14 +25,12 @@ def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizati # same as ggml_compute_fp32_to_bf16 in ggml-impl.h def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray: - n = n.astype(np.float32, copy=False).view(np.int32) + n = n.astype(np.float32, copy=False).view(np.uint32) # force nan to quiet - n = np.where((n & 0x7fffffff) > 0x7f800000, (n & 0xffff0000) | (64 << 16), n) - # flush subnormals to zero - n = np.where((n & 0x7f800000) == 0, n & 0x80000000, n) + n = np.where((n & 0x7fffffff) > 0x7f800000, (n & np.uint32(0xffff0000)) | np.uint32(64 << 16), n) # round to nearest even - n = (n + (0x7fff + ((n >> 16) & 1))) >> 16 - return n.astype(np.int16) + n = (np.uint64(n) + (0x7fff + ((n >> 16) & 1))) >> 16 + return n.astype(np.uint16) # This is faster than np.vectorize and np.apply_along_axis because it works on more than one row at a time @@ -49,10 +47,10 @@ def __apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np. def __quantize_bf16_array(n: np.ndarray) -> np.ndarray: - return __apply_over_grouped_rows(__compute_fp32_to_bf16, arr=n, otype=np.int16, oshape=n.shape) + return __apply_over_grouped_rows(__compute_fp32_to_bf16, arr=n, otype=np.uint16, oshape=n.shape) -__quantize_bf16_lazy = LazyNumpyTensor._wrap_fn(__quantize_bf16_array, meta_noop=np.int16) +__quantize_bf16_lazy = LazyNumpyTensor._wrap_fn(__quantize_bf16_array, meta_noop=np.uint16) def quantize_bf16(n: np.ndarray): From 76614f352e94d25659306d9e97321f204e5de0d3 Mon Sep 17 00:00:00 2001 From: jdomke <28772296+jdomke@users.noreply.github.com> Date: Sun, 4 Aug 2024 01:34:41 +0900 Subject: [PATCH 23/28] ggml : reading the runtime sve config of the cpu (#8709) * ggml : reading the runtime sve config of the cpu * change to one time init to prevent performance drop * prefix variable to avoid possible conflicts * revert xxhash fix and add brackets --------- Co-authored-by: domke <673751-domke@users.noreply.gitlab.com> --- ggml/src/ggml-aarch64.c | 28 ++++++++++++++-------------- ggml/src/ggml-impl.h | 1 + ggml/src/ggml-quants.c | 4 ++-- ggml/src/ggml-quants.h | 4 ++++ ggml/src/ggml.c | 9 +++++++++ 5 files changed, 30 insertions(+), 16 deletions(-) diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c index af53dea17..d7a608997 100644 --- a/ggml/src/ggml-aarch64.c +++ b/ggml/src/ggml-aarch64.c @@ -384,8 +384,8 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * UNUSED(blocklen); #if defined(__ARM_FEATURE_SVE) - if (svcntw() == 8) { - GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) && + if (ggml_sve_cnt_b == QK8_0) { + GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) && "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance"); } #endif @@ -496,8 +496,8 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * UNUSED(blocklen); #if defined(__ARM_FEATURE_SVE) - if (svcntw() == 8) { - GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) && + if (ggml_sve_cnt_b == QK8_0) { + GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) && "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance"); } #endif @@ -614,7 +614,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * UNUSED(blocklen); #if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__)) - if (svcntw() == 8) { + if (ggml_sve_cnt_b == QK8_0) { const void * b_ptr = vx; const void * a_ptr = vy; float * res_ptr = s; @@ -680,12 +680,12 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * return; } else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { - GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) && + GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) && "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal " "performance"); } else if (ggml_cpu_has_neon()) { - GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) && + GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) && "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 " "quantization format for optimal performance"); } @@ -745,8 +745,8 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * UNUSED(blocklen); #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) - if (svcntw() == 8) { - GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) && + if (ggml_sve_cnt_b == QK8_0) { + GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) && "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance"); } #endif @@ -1266,8 +1266,8 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * UNUSED(blocklen); #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) - if (svcntw() == 8) { - GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) && + if (ggml_sve_cnt_b == QK8_0) { + GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) && "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance"); } #endif @@ -1728,7 +1728,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * UNUSED(blocklen); #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__)) - if (svcntw() == 8) { + if (ggml_sve_cnt_b == QK8_0) { const void * b_ptr = vx; const void * a_ptr = vy; float * res_ptr = s; @@ -2139,12 +2139,12 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * return; } else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { - GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) && + GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) && "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal " "performance"); } else if (ggml_cpu_has_neon()) { - GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) && + GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) && "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 " "quantization format for optimal performance"); } diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 3daee4926..190af0810 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -143,6 +143,7 @@ extern "C" { #if defined(__ARM_FEATURE_SVE) #include +#include #endif // 16-bit float diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 16aaf523f..d5b91c2db 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -3818,7 +3818,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r float sumf = 0; #if defined(__ARM_FEATURE_SVE) - if (svcntb() == QK8_0) { + if (ggml_sve_cnt_b == QK8_0) { const svbool_t ptrueh = svptrue_pat_b8(SV_VL16); const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh); @@ -5303,7 +5303,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r float sumf = 0; #if defined(__ARM_FEATURE_SVE) - if (svcntb() == QK8_0) { + if (ggml_sve_cnt_b == QK8_0) { svfloat32_t sumv0 = svdup_n_f32(0.0f); svfloat32_t sumv1 = svdup_n_f32(0.0f); diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index 88b1f3269..525d5ee30 100644 --- a/ggml/src/ggml-quants.h +++ b/ggml/src/ggml-quants.h @@ -127,6 +127,10 @@ void iq2xs_free_impl(enum ggml_type type); void iq3xs_init_impl(int grid_size); void iq3xs_free_impl(int grid_size); +#if defined(__ARM_FEATURE_SVE) +extern int ggml_sve_cnt_b; +#endif + #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index be672f6ef..42f4a34b8 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -37,6 +37,9 @@ #include #endif +#if defined(__ARM_FEATURE_SVE) +int ggml_sve_cnt_b = 0; +#endif #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8) #undef GGML_USE_LLAMAFILE #endif @@ -3558,6 +3561,12 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { GGML_ASSERT_ALIGNED(ctx->mem_buffer); +#if defined(__ARM_FEATURE_SVE) + if (!ggml_sve_cnt_b) { + ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL); + } +#endif + GGML_PRINT_DEBUG("%s: context initialized\n", __func__); ggml_critical_section_end(); From 4b77ea95f56a4c49bc995f08eac62a6416875ccc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 4 Aug 2024 05:53:20 +0300 Subject: [PATCH 24/28] flake.lock: Update (#8847) --- flake.lock | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/flake.lock b/flake.lock index 3dc68abb6..c54af88ea 100644 --- a/flake.lock +++ b/flake.lock @@ -5,11 +5,11 @@ "nixpkgs-lib": "nixpkgs-lib" }, "locked": { - "lastModified": 1719994518, - "narHash": "sha256-pQMhCCHyQGRzdfAkdJ4cIWiw+JNuWsTX7f0ZYSyz0VY=", + "lastModified": 1722555600, + "narHash": "sha256-XOQkdLafnb/p9ij77byFQjDf5m5QYl9b2REiVClC+x4=", "owner": "hercules-ci", "repo": "flake-parts", - "rev": "9227223f6d922fee3c7b190b2cc238a99527bbb7", + "rev": "8471fe90ad337a8074e957b69ca4d0089218391d", "type": "github" }, "original": { @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1722062969, - "narHash": "sha256-QOS0ykELUmPbrrUGmegAUlpmUFznDQeR4q7rFhl8eQg=", + "lastModified": 1722421184, + "narHash": "sha256-/DJBI6trCeVnasdjUo9pbnodCLZcFqnVZiLUfqLH4jA=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "b73c2221a46c13557b1b3be9c2070cc42cf01eb3", + "rev": "9f918d616c5321ad374ae6cb5ea89c9e04bf3e58", "type": "github" }, "original": { @@ -36,14 +36,14 @@ }, "nixpkgs-lib": { "locked": { - "lastModified": 1719876945, - "narHash": "sha256-Fm2rDDs86sHy0/1jxTOKB1118Q0O3Uc7EC0iXvXKpbI=", + "lastModified": 1722555339, + "narHash": "sha256-uFf2QeW7eAHlYXuDktm9c25OxOyCoUOQmh5SZ9amE5Q=", "type": "tarball", - "url": "https://github.com/NixOS/nixpkgs/archive/5daf0514482af3f97abaefc78a6606365c9108e2.tar.gz" + "url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz" }, "original": { "type": "tarball", - "url": "https://github.com/NixOS/nixpkgs/archive/5daf0514482af3f97abaefc78a6606365c9108e2.tar.gz" + "url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz" } }, "root": { From 01aae2b4975b57a265ce8194928fd87f2d71027e Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Sat, 3 Aug 2024 15:07:47 +0200 Subject: [PATCH 25/28] baby-llama : remove duplicate vector include --- examples/baby-llama/baby-llama.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index 4f6c3746a..aca332e94 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -1,7 +1,6 @@ #include "ggml.h" #include "train.h" -#include #include #include #include From ecf6b7f23e664afd7ff856ec39034240ce438daa Mon Sep 17 00:00:00 2001 From: Brian Cunnie Date: Sun, 4 Aug 2024 03:55:03 -0700 Subject: [PATCH 26/28] batched-bench : handle empty `-npl` (#8839) * [example] batched-bench "segmentation fault" When `llama-batched-bench` is invoked _without_ setting `-npl`, "number of parallel prompts", it segfaults. The segfault is caused by invoking `max_element()` on a zero-length vector, `n_pl` This commit addresses that by first checking to see if the number of parallel prompts is zero, and if so sets the maximum sequence size to 1; otherwise, sets it to the original, the result of `max_element()`. Fixes, when running `lldb build/bin/llama-batched-bench -- -m models/Meta-Llama-3-8B.gguf` ``` * thread #1, queue = 'com.apple.main-thread', stop reason = EXC_BAD_ACCESS (code=1, address=0x0) frame #0: 0x000000010000366c llama-batched-bench`main(argc=3, argv=0x000000016fdff268) at batched-bench.cpp:72:28 69 llama_context_params ctx_params = llama_context_params_from_gpt_params(params); 70 71 // ensure enough sequences are available -> 72 ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end()); ``` * Update examples/batched-bench/batched-bench.cpp Co-authored-by: compilade --------- Co-authored-by: Georgi Gerganov Co-authored-by: compilade --- examples/batched-bench/batched-bench.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 718f0a61a..25e7c775a 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -69,7 +69,7 @@ int main(int argc, char ** argv) { llama_context_params ctx_params = llama_context_params_from_gpt_params(params); // ensure enough sequences are available - ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end()); + ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end()); llama_context * ctx = llama_new_context_with_model(model, ctx_params); From 978ba3d83d17b10fdf9807006048432b5b3769fc Mon Sep 17 00:00:00 2001 From: ardfork <134447697+ardfork@users.noreply.github.com> Date: Sun, 4 Aug 2024 18:16:23 +0000 Subject: [PATCH 27/28] Server: Don't ignore llama.cpp params (#8754) * Don't ignore llama.cpp params * Add fallback for max_tokens --- examples/server/server.cpp | 2 +- examples/server/utils.hpp | 18 ------------------ 2 files changed, 1 insertion(+), 19 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 7813a2957..d5f131d9b 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -900,7 +900,7 @@ struct server_context { slot.params.stream = json_value(data, "stream", false); slot.params.cache_prompt = json_value(data, "cache_prompt", false); - slot.params.n_predict = json_value(data, "n_predict", default_params.n_predict); + slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", default_params.n_predict)); slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k); slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p); slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p); diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index db6b3b74d..e6a1f0697 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -355,24 +355,6 @@ static json oaicompat_completion_params_parse( llama_params["__oaicompat"] = true; - // Map OpenAI parameters to llama.cpp parameters - // - // For parameters that are defined by the OpenAI documentation (e.g. - // temperature), we explicitly specify OpenAI's intended default; we - // need to do that because sometimes OpenAI disagrees with llama.cpp - // - // https://platform.openai.com/docs/api-reference/chat/create - llama_sampling_params default_sparams; - llama_params["model"] = json_value(body, "model", std::string("unknown")); - llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0); - llama_params["logit_bias"] = json_value(body, "logit_bias", json::object()); - llama_params["n_predict"] = json_value(body, "max_tokens", -1); - llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0); - llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED); - llama_params["stream"] = json_value(body, "stream", false); - llama_params["temperature"] = json_value(body, "temperature", 1.0); - llama_params["top_p"] = json_value(body, "top_p", 1.0); - // Apply chat template to the list of messages llama_params["prompt"] = format_chat(model, chat_template, body.at("messages")); From 0d6fb52be0c1b7e77eb855f3adc4952771c8ce4c Mon Sep 17 00:00:00 2001 From: Brandon Squizzato <35474886+bsquizz@users.noreply.github.com> Date: Sun, 4 Aug 2024 14:17:16 -0400 Subject: [PATCH 28/28] Install curl in runtime layer (#8693) --- .devops/llama-server.Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.devops/llama-server.Dockerfile b/.devops/llama-server.Dockerfile index b631d5806..ff558604e 100644 --- a/.devops/llama-server.Dockerfile +++ b/.devops/llama-server.Dockerfile @@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04 FROM ubuntu:$UBUNTU_VERSION AS build RUN apt-get update && \ - apt-get install -y build-essential git libcurl4-openssl-dev curl + apt-get install -y build-essential git libcurl4-openssl-dev WORKDIR /app @@ -16,7 +16,7 @@ RUN make -j$(nproc) llama-server FROM ubuntu:$UBUNTU_VERSION AS runtime RUN apt-get update && \ - apt-get install -y libcurl4-openssl-dev libgomp1 + apt-get install -y libcurl4-openssl-dev libgomp1 curl COPY --from=build /app/llama-server /llama-server