llama.cpp/ggml-cuda.cu

#include "ggml-cuda.h"
#include "ggml.h"
#include "ggml-backend-impl.h"

#if defined(GGML_USE_HIPBLAS)
#define GGML_COMMON_DECL_HIP
#define GGML_COMMON_IMPL_HIP
#else
#define GGML_COMMON_DECL_CUDA
#define GGML_COMMON_IMPL_CUDA
#endif
#include "ggml-common.h"

#include <algorithm>
#include <array>
#include <assert.h>
#include <atomic>
#include <cinttypes>
#include <cstddef>
#include <cstdint>
#include <float.h>
#include <limits>
#include <map>
#include <memory>
#include <mutex>
#include <stdint.h>
#include <stdio.h>
#include <string>
#include <vector>

// stringize macro for converting __CUDA_ARCH_LIST__ (list of integers) to string
#define STRINGIZE_IMPL(...) #__VA_ARGS__
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)

#if defined(GGML_USE_HIPBLAS)
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
#include <hip/hip_fp16.h>
#ifdef __HIP_PLATFORM_AMD__
// for rocblas_initialize()
#include "rocblas/rocblas.h"
#endif // __HIP_PLATFORM_AMD__
#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
#define CUBLAS_OP_N HIPBLAS_OP_N
#define CUBLAS_OP_T HIPBLAS_OP_T
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
#define CUBLAS_TF32_TENSOR_OP_MATH 0
#define CUDA_R_16F  HIPBLAS_R_16F
#define CUDA_R_32F  HIPBLAS_R_32F
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
#define cublasCreate hipblasCreate
#define cublasDestroy hipblasDestroy
#define cublasGemmEx hipblasGemmEx
#define cublasGemmBatchedEx hipblasGemmBatchedEx
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
#define cublasHandle_t hipblasHandle_t
#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
#define cublasSetStream hipblasSetStream
#define cublasSgemm hipblasSgemm
#define cublasStatus_t hipblasStatus_t
#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
#define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
#define cudaEventCreateWithFlags hipEventCreateWithFlags
#define cudaEventDisableTiming hipEventDisableTiming
#define cudaEventRecord hipEventRecord
#define cudaEventSynchronize hipEventSynchronize
#define cudaEvent_t hipEvent_t
#define cudaEventDestroy hipEventDestroy
#define cudaFree hipFree
#define cudaFreeHost hipHostFree
#define cudaGetDevice hipGetDevice
#define cudaGetDeviceCount hipGetDeviceCount
#define cudaGetDeviceProperties hipGetDeviceProperties
#define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError
#define cudaHostRegister hipHostRegister
#define cudaHostRegisterPortable hipHostRegisterPortable
#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
#define cudaHostUnregister hipHostUnregister
#define cudaLaunchHostFunc hipLaunchHostFunc
#ifdef GGML_HIP_UMA
#define cudaMalloc hipMallocManaged
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
#else
#define cudaMalloc hipMalloc
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
#endif
#define cudaMemcpy hipMemcpy
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
#define cudaMemcpy2DAsync hipMemcpy2DAsync
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemcpyKind hipMemcpyKind
#define cudaMemset hipMemset
#define cudaMemsetAsync hipMemsetAsync
#define cudaMemGetInfo hipMemGetInfo
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
#define cudaSetDevice hipSetDevice
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
#define cudaStreamDestroy hipStreamDestroy
#define cudaStreamFireAndForget hipStreamFireAndForget
#define cudaStreamNonBlocking hipStreamNonBlocking
#define cudaStreamPerThread hipStreamPerThread
#define cudaStreamSynchronize hipStreamSynchronize
#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
#define cudaStream_t hipStream_t
#define cudaSuccess hipSuccess
#define __trap abort
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
#else
#include <cuda_runtime.h>
#include <cuda.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>

#if CUDART_VERSION < 11020
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
#define CUBLAS_COMPUTE_16F CUDA_R_16F
#define CUBLAS_COMPUTE_32F CUDA_R_32F
#define cublasComputeType_t cudaDataType_t
#endif // CUDART_VERSION < 11020

#endif // defined(GGML_USE_HIPBLAS)

#define CUDART_HMAX     11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)

#define CC_PASCAL     600
#define MIN_CC_DP4A   610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
#define CC_VOLTA      700
#define CC_OFFSET_AMD 1000000
#define CC_RDNA1      (CC_OFFSET_AMD + 1010)
#define CC_RDNA2      (CC_OFFSET_AMD + 1030)
#define CC_RDNA3      (CC_OFFSET_AMD + 1100)

// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
// -  7B quantum model: +100-200 MB
// - 13B quantum model: +200-400 MB
//
//#define GGML_CUDA_FORCE_MMQ

// TODO: improve this to be correct for more hardware
//       for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
#if !defined(GGML_CUDA_FORCE_MMQ)
#define CUDA_USE_TENSOR_CORES
#endif

#define MMVQ_MAX_BATCH_SIZE  8 // max batch size to use MMVQ kernels
#define  MMQ_MAX_BATCH_SIZE 32 // max batch size to use MMQ kernels when tensor cores are available

#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses


#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");

[[noreturn]]
static void ggml_cuda_error(const char * stmt, const char * func, const char * file, const int line, const char * msg) {
    int id = -1; // in case cudaGetDevice fails
    cudaGetDevice(&id);

    fprintf(stderr, "CUDA error: %s\n", msg);
    fprintf(stderr, "  current device: %d, in function %s at %s:%d\n", id, func, file, line);
    fprintf(stderr, "  %s\n", stmt);
    // abort with GGML_ASSERT to get a stack trace
    GGML_ASSERT(!"CUDA error");
}

#define CUDA_CHECK_GEN(err, success, error_fn)                                      \
     do {                                                                           \
        auto err_ = (err);                                                          \
        if (err_ != (success)) {                                                    \
            ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_));    \
        }                                                                           \
    } while (0)

#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)

#if CUDART_VERSION >= 12000
    static const char * cublas_get_error_str(const cublasStatus_t err) {
        return cublasGetStatusString(err);
    }
#else
    static const char * cublas_get_error_str(const cublasStatus_t err) {
        switch (err) {
            case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
            case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
            case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
            case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
            case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
            case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
            case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
            case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
            case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
            default: return "unknown error";
        }
    }
#endif // CUDART_VERSION >= 12000

#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)

#if !defined(GGML_USE_HIPBLAS)
static const char * cu_get_error_str(CUresult err) {
    const char * err_str;
    cuGetErrorString(err, &err_str);
    return err_str;
}
#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
#endif

#if CUDART_VERSION >= 11100
#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
#else
#define GGML_CUDA_ASSUME(x)
#endif // CUDART_VERSION >= 11100


#define GGML_CUDA_MAX_STREAMS 8

struct ggml_tensor_extra_gpu {
    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
    cudaEvent_t events[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS]; // events for synchronizing multiple GPUs
};

// this is faster on Windows
// probably because the Windows CUDA libraries forget to make this check before invoking the drivers
static void ggml_cuda_set_device(const int device) {
    int current_device;
    CUDA_CHECK(cudaGetDevice(&current_device));

    if (device == current_device) {
        return;
    }

    CUDA_CHECK(cudaSetDevice(device));
}

static int ggml_cuda_get_device() {
    int id;
    CUDA_CHECK(cudaGetDevice(&id));
    return id;
}

struct ggml_cuda_device_info {
    int device_count;

    struct cuda_device_info {
        int     cc;                 // compute capability
        size_t  smpb;               // max. shared memory per block
        bool    vmm;                // virtual memory support
        size_t  vmm_granularity;    // granularity of virtual memory
        size_t  total_vram;
    };

    cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {};

    std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};
};

static ggml_cuda_device_info ggml_cuda_init() {
#ifdef __HIP_PLATFORM_AMD__
    // Workaround for a rocBLAS bug when using multiple graphics cards:
    // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
    rocblas_initialize();
    CUDA_CHECK(cudaDeviceSynchronize());
#endif

    ggml_cuda_device_info info = {};

    cudaError_t err = cudaGetDeviceCount(&info.device_count);
    if (err != cudaSuccess) {
        fprintf(stderr, "%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
        return info;
    }

    GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES);

    int64_t total_vram = 0;
#if defined(GGML_CUDA_FORCE_MMQ)
    fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ:   yes\n", __func__);
#else
    fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ:   no\n", __func__);
#endif
#if defined(CUDA_USE_TENSOR_CORES)
    fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
#else
    fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
#endif
    fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
    for (int id = 0; id < info.device_count; ++id) {
        int device_vmm = 0;

#if !defined(GGML_USE_HIPBLAS)
        CUdevice device;
        CU_CHECK(cuDeviceGet(&device, id));
        CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));

        if (device_vmm) {
            CUmemAllocationProp alloc_prop = {};
            alloc_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
            alloc_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
            alloc_prop.location.id = id;
            CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
        }
#endif // !defined(GGML_USE_HIPBLAS)
        info.devices[id].vmm = !!device_vmm;

        cudaDeviceProp prop;
        CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
        fprintf(stderr, "  Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");

        info.default_tensor_split[id] = total_vram;
        total_vram += prop.totalGlobalMem;

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
        info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
#else
        info.devices[id].cc = 100*prop.major + 10*prop.minor;
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
        info.devices[id].smpb = prop.sharedMemPerBlock;
    }

    for (int id = 0; id < info.device_count; ++id) {
        info.default_tensor_split[id] /= total_vram;
    }

    // configure logging to stdout
    // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));

    return info;
}

static const ggml_cuda_device_info & get_cuda_global_info() {
    static ggml_cuda_device_info info = ggml_cuda_init();
    return info;
}

// #define DEBUG_CUDA_MALLOC

// buffer pool for cuda (legacy)
struct ggml_cuda_pool {
    virtual ~ggml_cuda_pool() = default;

    virtual void * alloc(size_t size, size_t * actual_size) = 0;
    virtual void free(void * ptr, size_t size) = 0;
};

struct ggml_cuda_pool_leg : public ggml_cuda_pool {
    static const int MAX_BUFFERS = 256;

    int device;
    struct ggml_cuda_buffer {
        void * ptr = nullptr;
        size_t size = 0;
    };

    ggml_cuda_buffer buffer_pool[MAX_BUFFERS] = {};
    size_t pool_size = 0;

    explicit ggml_cuda_pool_leg(int device) :
        device(device) {
    }

    ~ggml_cuda_pool_leg() {
        ggml_cuda_set_device(device);
        for (int i = 0; i < MAX_BUFFERS; ++i) {
            ggml_cuda_buffer & b = buffer_pool[i];
            if (b.ptr != nullptr) {
                CUDA_CHECK(cudaFree(b.ptr));
                pool_size -= b.size;
            }
        }
        GGML_ASSERT(pool_size == 0);
    }

    void * alloc(size_t size, size_t * actual_size) override {
#ifdef DEBUG_CUDA_MALLOC
        int nnz = 0;
        size_t max_size = 0;
#endif
        size_t best_diff = 1ull << 36;
        int ibest = -1;
        for (int i = 0; i < MAX_BUFFERS; ++i) {
            ggml_cuda_buffer& b = buffer_pool[i];
            if (b.ptr != nullptr) {
#ifdef DEBUG_CUDA_MALLOC
                ++nnz;
                if (b.size > max_size) max_size = b.size;
#endif
                if (b.size >= size) {
                    size_t diff = b.size - size;
                    if (diff < best_diff) {
                        best_diff = diff;
                        ibest = i;
                        if (!best_diff) {
                            void * ptr = b.ptr;
                            *actual_size = b.size;
                            b.ptr = nullptr;
                            b.size = 0;
                            return ptr;
                        }
                    }
                }
            }
        }
        if (ibest >= 0) {
            ggml_cuda_buffer& b = buffer_pool[ibest];
            void * ptr = b.ptr;
            *actual_size = b.size;
            b.ptr = nullptr;
            b.size = 0;
            return ptr;
        }
        void * ptr;
        size_t look_ahead_size = (size_t) (1.05 * size);
        look_ahead_size = 256 * ((look_ahead_size + 255)/256);
        ggml_cuda_set_device(device);
        CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
        *actual_size = look_ahead_size;
        pool_size += look_ahead_size;
#ifdef DEBUG_CUDA_MALLOC
        fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
                (uint32_t)(max_size/1024/1024), (uint32_t)(pool_size/1024/1024), (uint32_t)(size/1024/1024));
#endif
        return ptr;
    }

    void free(void * ptr, size_t size) override {
        for (int i = 0; i < MAX_BUFFERS; ++i) {
            ggml_cuda_buffer& b = buffer_pool[i];
            if (b.ptr == nullptr) {
                b.ptr = ptr;
                b.size = size;
                return;
            }
        }
        fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
        ggml_cuda_set_device(device);
        CUDA_CHECK(cudaFree(ptr));
        pool_size -= size;
    }
};

// pool with virtual memory
#if !defined(GGML_USE_HIPBLAS)
struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
    static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB

    int device;
    CUdeviceptr pool_addr = 0;
    size_t pool_used = 0;
    size_t pool_size = 0;
    size_t granularity;

    explicit ggml_cuda_pool_vmm(int device) :
        device(device),
        granularity(get_cuda_global_info().devices[device].vmm_granularity) {
    }

    ~ggml_cuda_pool_vmm() {
        if (pool_addr != 0) {
            CU_CHECK(cuMemUnmap(pool_addr, pool_size));
            CU_CHECK(cuMemAddressFree(pool_addr, CUDA_POOL_VMM_MAX_SIZE));
        }
    }

    void * alloc(size_t size, size_t * actual_size) override {
        // round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
        const size_t alignment = 128;
        size = alignment * ((size + alignment - 1) / alignment);

        size_t avail = pool_size - pool_used;

        if (size > avail) {
            // round up to the next multiple of the granularity
            size_t reserve_size = size - avail;
            reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);

            GGML_ASSERT(pool_size + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);

            // allocate more physical memory
            CUmemAllocationProp prop = {};
            prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
            prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
            prop.location.id = device;
            CUmemGenericAllocationHandle handle;
            CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));

            // reserve virtual address space (if not already reserved)
            if (pool_addr == 0) {
                CU_CHECK(cuMemAddressReserve(&pool_addr, CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
            }

            // map at the end of the pool
            CU_CHECK(cuMemMap(pool_addr + pool_size, reserve_size, 0, handle, 0));

            // the memory allocation handle is no longer needed after mapping
            CU_CHECK(cuMemRelease(handle));

            // set access
            CUmemAccessDesc access = {};
            access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
            access.location.id = device;
            access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
            CU_CHECK(cuMemSetAccess(pool_addr + pool_size, reserve_size, &access, 1));

            // add to the pool
            pool_size += reserve_size;

            //printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
            //       id, (unsigned long long) (pool_size[id]/1024/1024),
            //       (unsigned long long) (reserve_size/1024/1024));
        }

        GGML_ASSERT(pool_addr != 0);

        void * ptr = (void *) (pool_addr + pool_used);
        *actual_size = size;
        pool_used += size;

#ifdef DEBUG_CUDA_MALLOC
        printf("cuda pool[%d]: allocated %llu bytes at %llx\n", device, (unsigned long long) size, ptr);
#endif

        return ptr;
    }

    void free(void * ptr, size_t size) override {
#ifdef DEBUG_CUDA_MALLOC
        printf("cuda pool[%d]: freed %llu bytes at %llx\n", device, (unsigned long long) size, ptr);
#endif

        pool_used -= size;

        // all deallocations must be in reverse order of the allocations
        GGML_ASSERT(ptr == (void *) (pool_addr + pool_used));
    }
};
#endif // !defined(GGML_USE_HIPBLAS)

template<typename T>
struct ggml_cuda_pool_alloc {
    ggml_cuda_pool * pool = nullptr;
    T * ptr = nullptr;
    size_t actual_size = 0;

    ggml_cuda_pool_alloc() = default;

    explicit ggml_cuda_pool_alloc(ggml_cuda_pool & pool) : pool(&pool) {
    }

    ggml_cuda_pool_alloc(ggml_cuda_pool & pool, size_t size) : pool(&pool) {
        alloc(size);
    }

    ~ggml_cuda_pool_alloc() {
        if (ptr != nullptr) {
            pool->free(ptr, actual_size);
        }
    }

    // size is in number of elements
    T * alloc(size_t size) {
        GGML_ASSERT(pool != nullptr);
        GGML_ASSERT(ptr == nullptr);
        ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
        return ptr;
    }

    T * alloc(ggml_cuda_pool & pool, size_t size) {
        this->pool = &pool;
        return alloc(size);
    }

    T * get() {
        return ptr;
    }

    ggml_cuda_pool_alloc(const ggml_cuda_pool_alloc &) = delete;
    ggml_cuda_pool_alloc(ggml_cuda_pool_alloc &&) = delete;
    ggml_cuda_pool_alloc& operator=(const ggml_cuda_pool_alloc &) = delete;
    ggml_cuda_pool_alloc& operator=(ggml_cuda_pool_alloc &&) = delete;
};


// backend interface

struct ggml_backend_cuda_context {
    int device;
    std::string name;
    cudaEvent_t copy_event = nullptr;

    cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } };
    cublasHandle_t cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

    explicit ggml_backend_cuda_context(int device) :
        device(device),
        name(GGML_CUDA_NAME + std::to_string(device)) {
    }

    ~ggml_backend_cuda_context() {
        if (copy_event != nullptr) {
            CUDA_CHECK(cudaEventDestroy(copy_event));
        }
        for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) {
            for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) {
                if (streams[i][j] != nullptr) {
                    CUDA_CHECK(cudaStreamDestroy(streams[i][j]));
                }
            }
            if (cublas_handles[i] != nullptr) {
                CUBLAS_CHECK(cublasDestroy(cublas_handles[i]));
            }
        }
    }

    cudaStream_t stream(int device, int stream) {
        if (streams[device][stream] == nullptr) {
            ggml_cuda_set_device(device);
            CUDA_CHECK(cudaStreamCreateWithFlags(&streams[device][stream], cudaStreamNonBlocking));
        }
        return streams[device][stream];
    }

    cudaStream_t stream() {
        return stream(device, 0);
    }

    cublasHandle_t cublas_handle(int device) {
        if (cublas_handles[device] == nullptr) {
            ggml_cuda_set_device(device);
            CUBLAS_CHECK(cublasCreate(&cublas_handles[device]));
            CUBLAS_CHECK(cublasSetMathMode(cublas_handles[device], CUBLAS_TF32_TENSOR_OP_MATH));
        }
        return cublas_handles[device];
    }

    cublasHandle_t cublas_handle() {
        return cublas_handle(device);
    }

    // pool
    std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES];

    static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device) {
#if !defined(GGML_USE_HIPBLAS)
        if (get_cuda_global_info().devices[device].vmm) {
            return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
        }
#endif
        return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
    }

    ggml_cuda_pool & pool(int device) {
        if (pools[device] == nullptr) {
            pools[device] = new_pool_for_device(device);
        }
        return *pools[device];
    }

    ggml_cuda_pool & pool() {
        return pool(device);
    }
};

// cuda buffer

struct ggml_backend_cuda_buffer_context {
    int device;
    void * dev_ptr = nullptr;
    std::string name;

    ggml_backend_cuda_buffer_context(int device, void * dev_ptr) :
        device(device), dev_ptr(dev_ptr),
        name(GGML_CUDA_NAME + std::to_string(device)) {
    }

    ~ggml_backend_cuda_buffer_context() {
        CUDA_CHECK(cudaFree(dev_ptr));
    }
};

GGML_CALL static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
    return ctx->name.c_str();
}

GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
    return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
}

GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
    delete ctx;
}

GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
    return ctx->dev_ptr;
}

GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;

    if (tensor->view_src != NULL && tensor->view_offs == 0) {
        assert(tensor->view_src->buffer->buft == buffer->buft);
        tensor->backend = tensor->view_src->backend;
        tensor->extra = tensor->view_src->extra;
        return;
    }

    if (ggml_is_quantized(tensor->type)) {
        // initialize padding to 0 to avoid possible NaN values
        size_t original_size = ggml_nbytes(tensor);
        size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);

        if (padded_size > original_size && tensor->view_src == nullptr) {
            ggml_cuda_set_device(ctx->device);
            CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
        }
    }
}

GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;

    ggml_cuda_set_device(ctx->device);
    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
    CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
}

GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;

    ggml_cuda_set_device(ctx->device);
    CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
    CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
}

GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
    if (ggml_backend_buffer_is_cuda(src->buffer)) {
        ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
        ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)dst->buffer->context;
        if (src_ctx->device == dst_ctx->device) {
            CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice, cudaStreamPerThread));
        } else {
            CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, dst_ctx->device, src->data, src_ctx->device, ggml_nbytes(src), cudaStreamPerThread));
        }
        CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
        return true;
    }
    return false;

    GGML_UNUSED(buffer);
}

GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;

    ggml_cuda_set_device(ctx->device);
    CUDA_CHECK(cudaDeviceSynchronize());
    CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
    CUDA_CHECK(cudaDeviceSynchronize());
}

static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
    /* .get_name        = */ ggml_backend_cuda_buffer_get_name,
    /* .free_buffer     = */ ggml_backend_cuda_buffer_free_buffer,
    /* .get_base        = */ ggml_backend_cuda_buffer_get_base,
    /* .init_tensor     = */ ggml_backend_cuda_buffer_init_tensor,
    /* .set_tensor      = */ ggml_backend_cuda_buffer_set_tensor,
    /* .get_tensor      = */ ggml_backend_cuda_buffer_get_tensor,
    /* .cpy_tensor      = */ ggml_backend_cuda_buffer_cpy_tensor,
    /* .clear           = */ ggml_backend_cuda_buffer_clear,
    /* .reset           = */ NULL,
};

// cuda buffer type
struct ggml_backend_cuda_buffer_type_context {
    int device;
    std::string name;
};

GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) {
    ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;

    return ctx->name.c_str();
}

GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;

    ggml_cuda_set_device(buft_ctx->device);

    size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0

    void * dev_ptr;
    cudaError_t err = cudaMalloc(&dev_ptr, size);
    if (err != cudaSuccess) {
        fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
        return nullptr;
    }

    ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);

    return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
}

GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
    return 128;

    GGML_UNUSED(buft);
}

GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
    size_t size = ggml_nbytes(tensor);
    int64_t ne0 = tensor->ne[0];

    if (ggml_is_quantized(tensor->type)) {
        if (ne0 % MATRIX_ROW_PADDING != 0) {
            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
        }
    }

    return size;

    GGML_UNUSED(buft);
}

GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
    if (!ggml_backend_is_cuda(backend)) {
        return false;
    }

    ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

    return buft_ctx->device == cuda_ctx->device;
}

static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
    /* .get_name         = */ ggml_backend_cuda_buffer_type_name,
    /* .alloc_buffer     = */ ggml_backend_cuda_buffer_type_alloc_buffer,
    /* .get_alignment    = */ ggml_backend_cuda_buffer_type_get_alignment,
    /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
    /* .get_alloc_size   = */ ggml_backend_cuda_buffer_type_get_alloc_size,
    /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
    /* .is_host          = */ NULL,
};

GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
    static std::mutex mutex;
    std::lock_guard<std::mutex> lock(mutex);

    if (device >= ggml_backend_cuda_get_device_count()) {
        return nullptr;
    }

    static ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];

    static bool ggml_backend_cuda_buffer_type_initialized = false;

    if (!ggml_backend_cuda_buffer_type_initialized) {
        for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
            ggml_backend_cuda_buffer_types[i] = {
                /* .iface    = */ ggml_backend_cuda_buffer_type_interface,
                /* .context  = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)},
            };
        }
        ggml_backend_cuda_buffer_type_initialized = true;
    }

    return &ggml_backend_cuda_buffer_types[device];
}

// cuda split buffer

static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
    int64_t min_compute_capability = INT_MAX;
    int64_t max_compute_capability = INT_MIN;
    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
        if (tensor_split[id] < (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
            if (min_compute_capability > get_cuda_global_info().devices[id].cc) {
                min_compute_capability = get_cuda_global_info().devices[id].cc;
            }
            if (max_compute_capability < get_cuda_global_info().devices[id].cc) {
                max_compute_capability = get_cuda_global_info().devices[id].cc;
            }
        }
    }

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    switch(type) {
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
        case GGML_TYPE_F16:
        case GGML_TYPE_F32:
            return 1;
        case GGML_TYPE_Q2_K:
            return max_compute_capability >= CC_RDNA2 ? 128 : 32;
        case GGML_TYPE_Q3_K:
            return min_compute_capability < CC_RDNA2 ? 128 : 64;
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ2_S:
        case GGML_TYPE_IQ3_XXS:
        case GGML_TYPE_IQ1_S:
        case GGML_TYPE_IQ4_NL:
        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_IQ3_S:
            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
        default:
            GGML_ASSERT(false);
    }
#else
    switch(type) {
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
            return max_compute_capability >= CC_VOLTA ? 128 : 64;
        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
            return 64;
        case GGML_TYPE_F16:
        case GGML_TYPE_F32:
            return 1;
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ2_S:
        case GGML_TYPE_IQ3_XXS:
        case GGML_TYPE_IQ1_S:
        case GGML_TYPE_IQ4_NL:
        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_IQ3_S:
            return max_compute_capability >= CC_VOLTA ? 128 : 64;
        case GGML_TYPE_Q6_K:
            return 64;
        default:
            GGML_ASSERT(false);
    }
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
}

static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) {
    const int64_t nrows = ggml_nrows(tensor);
    const int64_t rounding = get_row_rounding(tensor->type, tensor_split);

    *row_low = id == 0 ? 0 : nrows*tensor_split[id];
    *row_low -= *row_low % rounding;

    if (id == ggml_backend_cuda_get_device_count() - 1) {
        *row_high = nrows;
    } else {
        *row_high = nrows*tensor_split[id + 1];
        *row_high -= *row_high % rounding;
    }
}

static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

    return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
}

struct ggml_backend_cuda_split_buffer_type_context {
    std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
};

struct ggml_backend_cuda_split_buffer_context {
    ~ggml_backend_cuda_split_buffer_context() {
        for (ggml_tensor_extra_gpu * extra : tensor_extras) {
            for (int id = 0; id < GGML_CUDA_MAX_DEVICES; ++id) {
                for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) {
                    if (extra->events[id][is] != nullptr) {
                        CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
                    }
                }
                if (extra->data_device[id] != nullptr) {
                    CUDA_CHECK(cudaFree(extra->data_device[id]));
                }
            }
            delete extra;
        }
    }

    std::vector<ggml_tensor_extra_gpu *> tensor_extras;
};

GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
    return GGML_CUDA_NAME "_Split";

    GGML_UNUSED(buffer);
}

static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
    return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
    GGML_UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds
}

GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
    delete ctx;
}

GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
    // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
    return (void *)0x1000;

    GGML_UNUSED(buffer);
}

GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
    GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported

    ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
    ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;

    const int64_t ne0 = tensor->ne[0];

    ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
    ctx->tensor_extras.push_back(extra);

    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
        int64_t row_low, row_high;
        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);

        int64_t nrows_split = row_high - row_low;
        if (nrows_split == 0) {
            continue;
        }

        size_t size = ggml_nbytes_split(tensor, nrows_split);
        const size_t original_size = size;

        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
        if (ne0 % MATRIX_ROW_PADDING != 0) {
            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
        }

        // FIXME: do not crash if cudaMalloc fails
        // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
        ggml_cuda_set_device(id);
        char * buf;
        CUDA_CHECK(cudaMalloc(&buf, size));

        // set padding to 0 to avoid possible NaN values
        if (size > original_size) {
            CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
        }

        extra->data_device[id] = buf;

        for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) {
            CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
        }
    }
    tensor->extra = extra;
}

GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    // split tensors must always be set in their entirety at once
    GGML_ASSERT(offset == 0);
    GGML_ASSERT(size == ggml_nbytes(tensor));

    ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;

    const int64_t ne0 = tensor->ne[0];
    const size_t nb1 = tensor->nb[1];
    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;

    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
        int64_t row_low, row_high;
        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);

        int64_t nrows_split = row_high - row_low;
        if (nrows_split == 0) {
            continue;
        }

        const size_t offset_split = row_low*nb1;
        size_t size = ggml_nbytes_split(tensor, nrows_split);
        const size_t original_size = size;

        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
        if (ne0 % MATRIX_ROW_PADDING != 0) {
            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
        }

        const char * buf_host = (const char *)data + offset_split;
        CUDA_CHECK(cudaMemcpyAsync(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice, cudaStreamPerThread));
    }

    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
        CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
    }
}

GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    // split tensors must always be set in their entirety at once
    GGML_ASSERT(offset == 0);
    GGML_ASSERT(size == ggml_nbytes(tensor));

    ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;

    const int64_t ne0 = tensor->ne[0];
    const size_t nb1 = tensor->nb[1];
    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;

    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
        int64_t row_low, row_high;
        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);

        int64_t nrows_split = row_high - row_low;
        if (nrows_split == 0) {
            continue;
        }

        const size_t offset_split = row_low*nb1;
        size_t size = ggml_nbytes_split(tensor, nrows_split);
        const size_t original_size = size;

        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
        if (ne0 % MATRIX_ROW_PADDING != 0) {
            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
        }

        char * buf_host = (char *)data + offset_split;
        CUDA_CHECK(cudaMemcpyAsync(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
    }

    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
        CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
    }
}

GGML_CALL static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
    GGML_UNUSED(buffer);
    GGML_UNUSED(value);
}

static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
    /* .get_name        = */ ggml_backend_cuda_split_buffer_get_name,
    /* .free_buffer     = */ ggml_backend_cuda_split_buffer_free_buffer,
    /* .get_base        = */ ggml_backend_cuda_split_buffer_get_base,
    /* .init_tensor     = */ ggml_backend_cuda_split_buffer_init_tensor,
    /* .set_tensor      = */ ggml_backend_cuda_split_buffer_set_tensor,
    /* .get_tensor      = */ ggml_backend_cuda_split_buffer_get_tensor,
    /* .cpy_tensor      = */ NULL,
    /* .clear           = */ ggml_backend_cuda_split_buffer_clear,
    /* .reset           = */ NULL,
};

// cuda split buffer type

GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
    return GGML_CUDA_NAME "_Split";

    GGML_UNUSED(buft);
}

GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
    // instead, we allocate them for each tensor separately in init_tensor
    // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
    // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
    ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context();

    return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
}

GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
    return 128;

    GGML_UNUSED(buft);
}

GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
    ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;

    size_t total_size = 0;

    const int64_t ne0 = tensor->ne[0];

    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
        int64_t row_low, row_high;
        get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id);

        int64_t nrows_split = row_high - row_low;
        if (nrows_split == 0) {
            continue;
        }

        total_size += ggml_nbytes_split(tensor, nrows_split);

        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
        if (ne0 % MATRIX_ROW_PADDING != 0) {
            total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
        }
    }

    return total_size;
}

GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
    return ggml_backend_is_cuda(backend);

    GGML_UNUSED(buft);
}

GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
    return false;

    GGML_UNUSED(buft);
}

static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = {
    /* .get_name         = */ ggml_backend_cuda_split_buffer_type_name,
    /* .alloc_buffer     = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
    /* .get_alignment    = */ ggml_backend_cuda_split_buffer_type_get_alignment,
    /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
    /* .get_alloc_size   = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
    /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
    /* .is_host          = */ ggml_backend_cuda_split_buffer_type_is_host,
};

GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
    static std::mutex mutex;
    std::lock_guard<std::mutex> lock(mutex);

    static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;

    std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {};

    bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_CUDA_MAX_DEVICES, [](float x) { return x == 0.0f; });
    if (all_zero) {
        tensor_split_arr = get_cuda_global_info().default_tensor_split;
    } else {
        float split_sum = 0.0f;
        for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
            tensor_split_arr[i] = split_sum;
            split_sum += tensor_split[i];
        }
        for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
            tensor_split_arr[i] /= split_sum;
        }
    }

    auto it = buft_map.find(tensor_split_arr);
    if (it != buft_map.end()) {
        return &it->second;
    }

    struct ggml_backend_buffer_type buft {
        /* .iface   = */ ggml_backend_cuda_split_buffer_type_interface,
        /* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr},
    };

    auto result = buft_map.emplace(tensor_split_arr, buft);
    return &result.first->second;
}

// host buffer type

GGML_CALL static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
    return GGML_CUDA_NAME "_Host";

    GGML_UNUSED(buft);
}

GGML_CALL static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
    return GGML_CUDA_NAME "_Host";

    GGML_UNUSED(buffer);
}

GGML_CALL static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    CUDA_CHECK(cudaFreeHost(buffer->context));
}

static void * ggml_cuda_host_malloc(size_t size) {
    if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
        return nullptr;
    }

    void * ptr = nullptr;
    cudaError_t err = cudaMallocHost((void **) &ptr, size);
    if (err != cudaSuccess) {
        // clear the error
        cudaGetLastError();
        fprintf(stderr, "%s: warning: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
            size/1024.0/1024.0, cudaGetErrorString(err));
        return nullptr;
    }

    return ptr;
}

GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    void * ptr = ggml_cuda_host_malloc(size);

    if (ptr == nullptr) {
        // fallback to cpu buffer
        return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
    }

    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
    buffer->buft = buft;
    buffer->iface.get_name = ggml_backend_cuda_host_buffer_name;
    buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;

    return buffer;
}

GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
    static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
        /* .iface    = */ {
            /* .get_name         = */ ggml_backend_cuda_host_buffer_type_name,
            /* .alloc_buffer     = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
            /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
            /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
            /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
            /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
        },
        /* .context  = */ nullptr,
    };

    return &ggml_backend_cuda_buffer_type_host;
}

//static bool ggml_backend_buffer_is_cuda_host(ggml_backend_buffer_t buffer) {
//    return buffer->buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name;
//}


/// kernels


#if defined(GGML_USE_HIPBLAS)
#define __CUDA_ARCH__ 1300

#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
    defined(__gfx1150__) || defined(__gfx1151__)
#define RDNA3
#endif

#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
#define RDNA2
#endif

#ifndef __has_builtin
    #define __has_builtin(x) 0
#endif

typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
#if __has_builtin(__builtin_elementwise_sub_sat)
    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
    return reinterpret_cast<const int &>(c);
#else
    int8x4_t c;
    int16_t tmp;
#pragma unroll
    for (int i = 0; i < 4; i++) {
        tmp = va[i] - vb[i];
        if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
        if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
        c[i] = tmp;
    }
    return reinterpret_cast<int &>(c);
#endif // __has_builtin(__builtin_elementwise_sub_sat)
}

static __device__ __forceinline__ int __vsub4(const int a, const int b) {
    return __vsubss4(a, b);
}

static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
    unsigned int c;
    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
#pragma unroll
    for (int i = 0; i < 4; ++i) {
        vc[i] = va[i] == vb[i] ? 0xff : 0x00;
    }
    return c;
}

static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
    c = __builtin_amdgcn_sdot4(a, b, c, false);
#elif defined(RDNA3)
    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
#elif defined(__gfx1010__) || defined(__gfx900__)
    int tmp1;
    int tmp2;
    asm("\n \
        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
        v_add3_u32 %0, %1, %2, %0 \n \
        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
        v_add3_u32 %0, %1, %2, %0 \n \
        "
        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
        : "v"(a), "v"(b)
    );
#else
    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
#endif
    return c;
}
#endif // defined(GGML_USE_HIPBLAS)


#ifdef GGML_CUDA_F16
typedef half dfloat; // dequantize float
typedef half2 dfloat2;
#else
typedef float dfloat; // dequantize float
typedef float2 dfloat2;
#endif //GGML_CUDA_F16

static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment

    int x32 = 0;
    x32 |= x16[0] <<  0;
    x32 |= x16[1] << 16;

    return x32;
}

static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment

    int x32 = 0;
    x32 |= x16[0] <<  0;
    x32 |= x16[1] << 16;

    return x32;
}

static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
}

static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
}

template<typename T>
using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int k, cudaStream_t stream);
typedef to_t_cuda_t<float> to_fp32_cuda_t;
typedef to_t_cuda_t<half> to_fp16_cuda_t;

typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
typedef void (*cpy_kernel_t)(const char * cx, char * cdst);

typedef void (*ggml_cuda_func_t)(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);

typedef void (*ggml_cuda_op_mul_mat_t)(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
    const int64_t src1_padded_row_size, cudaStream_t stream);

typedef void (*ggml_cuda_op_flatten_t)(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream);

typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
typedef void (*load_tiles_cuda_t)(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row);
typedef float (*vec_dot_q_mul_mat_cuda_t)(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k);

#define WARP_SIZE 32

#define CUDA_GELU_BLOCK_SIZE 256
#define CUDA_SILU_BLOCK_SIZE 256
#define CUDA_TANH_BLOCK_SIZE 256
#define CUDA_RELU_BLOCK_SIZE 256
#define CUDA_HARDSIGMOID_BLOCK_SIZE 256
#define CUDA_HARDSWISH_BLOCK_SIZE 256
#define CUDA_SQR_BLOCK_SIZE 256
#define CUDA_CPY_BLOCK_SIZE 32
#define CUDA_SCALE_BLOCK_SIZE 256
#define CUDA_CLAMP_BLOCK_SIZE 256
#define CUDA_ROPE_BLOCK_SIZE 256
#define CUDA_SOFT_MAX_BLOCK_SIZE 1024
#define CUDA_ALIBI_BLOCK_SIZE 32
#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
#define CUDA_QUANTIZE_BLOCK_SIZE 256
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
#define CUDA_GET_ROWS_BLOCK_SIZE 256
#define CUDA_UPSCALE_BLOCK_SIZE 256
#define CUDA_CONCAT_BLOCK_SIZE 256
#define CUDA_PAD_BLOCK_SIZE 256
#define CUDA_ARANGE_BLOCK_SIZE 256
#define CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE 256
#define CUDA_ACC_BLOCK_SIZE 256
#define CUDA_IM2COL_BLOCK_SIZE 256
#define CUDA_POOL2D_BLOCK_SIZE 256

#define CUDA_Q8_0_NE_ALIGN 2048

// dmmv = dequantize_mul_mat_vec
#ifndef GGML_CUDA_DMMV_X
#define GGML_CUDA_DMMV_X 32
#endif
#ifndef GGML_CUDA_MMV_Y
#define GGML_CUDA_MMV_Y 1
#endif

#ifndef K_QUANTS_PER_ITERATION
#define K_QUANTS_PER_ITERATION 2
#else
static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
#endif

#ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE
#define GGML_CUDA_PEER_MAX_BATCH_SIZE 128
#endif // GGML_CUDA_PEER_MAX_BATCH_SIZE

#define MUL_MAT_SRC1_COL_STRIDE 128

[[noreturn]]
static __device__ void no_device_code(
    const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
           file_name, line, function_name, arch);
    GGML_UNUSED(arch_list);
#else
    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
           file_name, line, function_name, arch, arch_list);
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    __trap();

    GGML_UNUSED(no_device_code); // suppress unused function warning
}

#ifdef __CUDA_ARCH__
#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
#else
//#define NO_DEVICE_CODE GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
#define NO_DEVICE_CODE
#endif // __CUDA_ARCH__

static __device__ __forceinline__ float warp_reduce_sum(float x) {
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
    }
    return x;
}

static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
    }
    return a;
}

#ifdef GGML_CUDA_F16
static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
#pragma unroll
   for (int mask = 16; mask > 0; mask >>= 1) {
       a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
   }
   return a;
#else
   GGML_UNUSED(a);
   NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
}
#endif // GGML_CUDA_F16

static __device__ __forceinline__ float warp_reduce_max(float x) {
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
    }
    return x;
}

//static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
//#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
//#pragma unroll
//    for (int mask = 16; mask > 0; mask >>= 1) {
//        x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
//    }
//    return x;
//#else
//    GGML_UNUSED(x);
//    NO_DEVICE_CODE;
//#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
//}

static __device__ __forceinline__ float op_repeat(const float a, const float b) {
    return b;
    GGML_UNUSED(a);
}

static __device__ __forceinline__ float op_add(const float a, const float b) {
    return a + b;
}

static __device__ __forceinline__ float op_mul(const float a, const float b) {
    return a * b;
}

static __device__ __forceinline__ float op_div(const float a, const float b) {
    return a / b;
}

template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
        int ne0, int ne1, int ne2, int ne3,
        int ne10, int ne11, int ne12, int ne13,
        /*int s0, */ int s1,  int s2,  int s3,
        /*int s10,*/ int s11, int s12, int s13) {
    const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
    const int i1 = (blockDim.y*blockIdx.y + threadIdx.y);
    const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3;
    const int i3 = (blockDim.z*blockIdx.z + threadIdx.z) % ne3;

    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
        return;
    }

    const int i11 = i1 % ne11;
    const int i12 = i2 % ne12;
    const int i13 = i3 % ne13;

    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
    const size_t i_dst  = i_src0;

    const src0_t * src0_row = src0 + i_src0;
    const src1_t * src1_row = src1 + i_src1;
    dst_t * dst_row = dst + i_dst;

    for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) {
        const int i10 = i0 % ne10;
        dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
    }
}

template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
        int ne0, int ne1, int ne2, int ne3,
        int ne10, int ne11, int ne12, int ne13,
        /*int s0, */ int s1,  int s2,  int s3,
        /*int s10,*/ int s11, int s12, int s13) {

    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    const int i3 = i/(ne2*ne1*ne0);
    const int i2 = (i/(ne1*ne0)) % ne2;
    const int i1 = (i/ne0) % ne1;
    const int i0 = i % ne0;

    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
        return;
    }

    const int i11 = i1 % ne11;
    const int i12 = i2 % ne12;
    const int i13 = i3 % ne13;

    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
    const size_t i_dst  = i_src0;

    const src0_t * src0_row = src0 + i_src0;
    const src1_t * src1_row = src1 + i_src1;
    dst_t * dst_row = dst + i_dst;

    const int i10 = i0 % ne10;
    dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
}

static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
    const int ne10, const int ne11, const int ne12,
    const int nb1, const int nb2, int offset) {
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= ne) {
        return;
    }
    int src1_idx = i - offset;
    int oz = src1_idx / nb2;
    int oy = (src1_idx - (oz * nb2)) / nb1;
    int ox = src1_idx % nb1;
    if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
    } else {
        dst[i] = x[i];
    }
}

static __global__ void gelu_f32(const float * x, float * dst, const int k) {
    const float GELU_COEF_A    = 0.044715f;
    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= k) {
        return;
    }

    float xi = x[i];
    dst[i] = 0.5f*xi*(1.0f + tanhf(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi)));
}

static __global__ void silu_f32(const float * x, float * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= k) {
        return;
    }
    dst[i] = x[i] / (1.0f + expf(-x[i]));
}

static __global__ void gelu_quick_f32(const float * x, float * dst, int k) {
    const float GELU_QUICK_COEF = -1.702f;
    const int i  = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    dst[i] = x[i] * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x[i])));
}

static __global__ void tanh_f32(const float * x, float * dst, int k) {
    const int i  = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    dst[i] = tanhf(x[i]);
}

static __global__ void relu_f32(const float * x, float * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= k) {
        return;
    }
    dst[i] = fmaxf(x[i], 0);
}

static __global__ void hardsigmoid_f32(const float * x, float * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= k) {
        return;
    }
    dst[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f));
}

static __global__ void hardswish_f32(const float * x, float * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= k) {
        return;
    }
    dst[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f));
}

static __global__ void leaky_relu_f32(const float * x, float * dst, const int k, const float negative_slope) {
    const int i  = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    dst[i] = fmaxf(x[i], 0) + fminf(x[i], 0.0f) * negative_slope;
}

static __global__ void sqr_f32(const float * x, float * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= k) {
        return;
    }
    dst[i] = x[i] * x[i];
}

template <int block_size>
static __global__ void norm_f32(const float * x, float * dst, const int ncols, const float eps) {
    const int row = blockIdx.x*blockDim.y + threadIdx.y;
    const int tid = threadIdx.x;

    float2 mean_var = make_float2(0.f, 0.f);

    for (int col = tid; col < ncols; col += block_size) {
        const float xi = x[row*ncols + col];
        mean_var.x += xi;
        mean_var.y += xi * xi;
    }

    // sum up partial sums
    mean_var = warp_reduce_sum(mean_var);
    if (block_size > WARP_SIZE) {
        __shared__ float2 s_sum[32];
        int warp_id = threadIdx.x / WARP_SIZE;
        int lane_id = threadIdx.x % WARP_SIZE;
        if (lane_id == 0) {
            s_sum[warp_id] = mean_var;
        }
        __syncthreads();
        mean_var = s_sum[lane_id];
        mean_var = warp_reduce_sum(mean_var);
    }

    const float mean = mean_var.x / ncols;
    const float var = mean_var.y / ncols - mean * mean;
    const float inv_std = rsqrtf(var + eps);

    for (int col = tid; col < ncols; col += block_size) {
        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
    }
}

static __global__ void concat_f32(const float * x,const float * y, float * dst, const int ne0, const int ne02) {
    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
    if (nidx >= ne0) {
        return;
    }
    // operation
    int offset_dst =
        nidx +
        blockIdx.y * ne0 +
        blockIdx.z * ne0 * gridDim.y;
    if (blockIdx.z < ne02) { // src0
        int offset_src =
            nidx +
            blockIdx.y * ne0 +
            blockIdx.z * ne0 * gridDim.y;
        dst[offset_dst] = x[offset_src];
    } else {
        int offset_src =
            nidx +
            blockIdx.y * ne0 +
            (blockIdx.z - ne02) * ne0 *  gridDim.y;
        dst[offset_dst] = y[offset_src];
    }
}

static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int ne00xne01, const int scale_factor) {
    // blockIdx.z: idx of ne02*ne03
    // blockIdx.y: idx of ne01*scale_factor， aka ne1
    // blockIDx.x: idx of ne00*scale_factor / BLOCK_SIZE
    // ne00xne01: ne00 * ne01
    int ne0 = ne00 * scale_factor;
    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
    if (nidx >= ne0) {
        return;
    }
    // operation
    int i00 = nidx / scale_factor;
    int i01 = blockIdx.y / scale_factor;
    int offset_src =
        i00 +
        i01 * ne00 +
        blockIdx.z * ne00xne01;
    int offset_dst =
        nidx +
        blockIdx.y * ne0 +
        blockIdx.z * ne0 * gridDim.y;
    dst[offset_dst] = x[offset_src];
}

static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
    // blockIdx.z: idx of ne2*ne3, aka ne02*ne03
    // blockIdx.y: idx of ne1
    // blockIDx.x: idx of ne0 / BLOCK_SIZE
    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
    if (nidx >= ne0) {
        return;
    }

    // operation
    int offset_dst =
        nidx +
        blockIdx.y * ne0 +
        blockIdx.z * ne0 * gridDim.y;
    if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
        int offset_src =
            nidx +
            blockIdx.y * ne00 +
            blockIdx.z * ne00 * ne01;
        dst[offset_dst] = x[offset_src];
    } else {
        dst[offset_dst] = 0.0f;
    }
}

static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
    // blockIDx.x: idx of ne0 / BLOCK_SIZE
    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
    if (nidx >= ne0) {
        return;
    }
    dst[nidx] = start + step * nidx;
}

static __global__ void timestep_embedding_f32(const float * timesteps, float * dst, const int nb1, const int dim, const int max_period) {
    // blockIDx.y: idx of timesteps->ne[0]
    // blockIDx.x: idx of ((dim + 1) / 2) / BLOCK_SIZE
    int i = blockIdx.y;
    int j = threadIdx.x + blockIdx.x * blockDim.x;
    float * embed_data = (float *)((char *)dst +  i*nb1);

    if (dim % 2 != 0 && j == ((dim + 1) / 2)) {
        embed_data[dim] = 0.f;
    }

    int half = dim / 2;
    if (j >= half) {
        return;
    }

    float timestep = timesteps[i];
    float freq = (float)expf(-logf(max_period) * j / half);
    float arg = timestep * freq;
    embed_data[j] = cosf(arg);
    embed_data[j + half] = sinf(arg);
}

template <int block_size>
static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
    // blockIdx.x: num_groups idx
    // threadIdx.x: block_size idx
    int start = blockIdx.x * group_size;
    int end = start + group_size;

    start += threadIdx.x;

    if (end >= ne_elements) {
        end = ne_elements;
    }

    float tmp = 0.0f; // partial sum for thread in warp

    for (int j = start; j < end; j += block_size) {
        tmp += x[j];
    }

    tmp = warp_reduce_sum(tmp);
    if (block_size > WARP_SIZE) {
        __shared__ float s_sum[32];
        int warp_id = threadIdx.x / WARP_SIZE;
        int lane_id = threadIdx.x % WARP_SIZE;
        if (lane_id == 0) {
            s_sum[warp_id] = tmp;
        }
        __syncthreads();
        tmp = s_sum[lane_id];
        tmp = warp_reduce_sum(tmp);
    }

    float mean = tmp / group_size;
    tmp = 0.0f;

    for (int j = start; j < end; j += block_size) {
        float xi = x[j] - mean;
        dst[j] = xi;
        tmp += xi * xi;
    }

    tmp = warp_reduce_sum(tmp);
    if (block_size > WARP_SIZE) {
        __shared__ float s_sum[32];
        int warp_id = threadIdx.x / WARP_SIZE;
        int lane_id = threadIdx.x % WARP_SIZE;
        if (lane_id == 0) {
            s_sum[warp_id] = tmp;
        }
        __syncthreads();
        tmp = s_sum[lane_id];
        tmp = warp_reduce_sum(tmp);
    }

    float variance = tmp / group_size;
    float scale = rsqrtf(variance + eps);
    for (int j = start; j < end; j += block_size) {
        dst[j] *= scale;
    }
}

template <int block_size>
static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
    const int row = blockIdx.x*blockDim.y + threadIdx.y;
    const int tid = threadIdx.x;

    float tmp = 0.0f; // partial sum for thread in warp

    for (int col = tid; col < ncols; col += block_size) {
        const float xi = x[row*ncols + col];
        tmp += xi * xi;
    }

    // sum up partial sums
    tmp = warp_reduce_sum(tmp);
    if (block_size > WARP_SIZE) {
        __shared__ float s_sum[32];
        int warp_id = threadIdx.x / WARP_SIZE;
        int lane_id = threadIdx.x % WARP_SIZE;
        if (lane_id == 0) {
            s_sum[warp_id] = tmp;
        }
        __syncthreads();
        tmp = s_sum[lane_id];
        tmp = warp_reduce_sum(tmp);
    }

    const float mean = tmp / ncols;
    const float scale = rsqrtf(mean + eps);

    for (int col = tid; col < ncols; col += block_size) {
        dst[row*ncols + col] = scale * x[row*ncols + col];
    }
}

static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
    const block_q4_0 * x = (const block_q4_0 *) vx;

    const dfloat d = x[ib].d;

    const int vui = x[ib].qs[iqs];

    v.x = vui & 0xF;
    v.y = vui >> 4;

#ifdef GGML_CUDA_F16
    v = __hsub2(v, {8.0f, 8.0f});
    v = __hmul2(v, {d, d});
#else
    v.x = (v.x - 8.0f) * d;
    v.y = (v.y - 8.0f) * d;
#endif // GGML_CUDA_F16
}

static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
    const block_q4_1 * x = (const block_q4_1 *) vx;

    const dfloat d = __low2half(x[ib].dm);
    const dfloat m = __high2half(x[ib].dm);

    const int vui = x[ib].qs[iqs];

    v.x = vui & 0xF;
    v.y = vui >> 4;

#ifdef GGML_CUDA_F16
    v = __hmul2(v, {d, d});
    v = __hadd2(v, {m, m});
#else
    v.x = (v.x * d) + m;
    v.y = (v.y * d) + m;
#endif // GGML_CUDA_F16
}

static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
    const block_q5_0 * x = (const block_q5_0 *) vx;

    const dfloat d = x[ib].d;

    uint32_t qh;
    memcpy(&qh, x[ib].qh, sizeof(qh));

    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;

    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
    v.y = ((x[ib].qs[iqs] >>  4) | xh_1);

#ifdef GGML_CUDA_F16
    v = __hsub2(v, {16.0f, 16.0f});
    v = __hmul2(v, {d, d});
#else
    v.x = (v.x - 16.0f) * d;
    v.y = (v.y - 16.0f) * d;
#endif // GGML_CUDA_F16
}

static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
    const block_q5_1 * x = (const block_q5_1 *) vx;

    const dfloat d = __low2half(x[ib].dm);
    const dfloat m = __high2half(x[ib].dm);

    uint32_t qh;
    memcpy(&qh, x[ib].qh, sizeof(qh));

    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;

    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
    v.y = ((x[ib].qs[iqs] >>  4) | xh_1);

#ifdef GGML_CUDA_F16
    v = __hmul2(v, {d, d});
    v = __hadd2(v, {m, m});
#else
    v.x = (v.x * d) + m;
    v.y = (v.y * d) + m;
#endif // GGML_CUDA_F16
}

static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
    const block_q8_0 * x = (const block_q8_0 *) vx;

    const dfloat d = x[ib].d;

    v.x = x[ib].qs[iqs + 0];
    v.y = x[ib].qs[iqs + 1];

#ifdef GGML_CUDA_F16
    v = __hmul2(v, {d, d});
#else
    v.x *= d;
    v.y *= d;
#endif // GGML_CUDA_F16
}

template<typename dst_t>
static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {

    const int i = blockIdx.x;

    // assume 32 threads
    const int tid = threadIdx.x;
    const int il  = tid/8;
    const int ir  = tid%8;
    const int ib = 8*i + ir;
    if (ib >= nb32) {
        return;
    }

    dst_t * y = yy + 256*i + 32*ir + 4*il;

    const block_q4_0 * x = (const block_q4_0 *)vx + ib;
    const float d = __half2float(x->d);
    const float dm = -8*d;

    const uint8_t * q = x->qs + 4*il;

    for (int l = 0; l < 4; ++l) {
        y[l+ 0] = d * (q[l] & 0xF) + dm;
        y[l+16] = d * (q[l] >>  4) + dm;
    }
}

template<typename dst_t>
static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {

    const int i = blockIdx.x;

    // assume 32 threads
    const int tid = threadIdx.x;
    const int il  = tid/8;
    const int ir  = tid%8;
    const int ib = 8*i + ir;
    if (ib >= nb32) {
        return;
    }

    dst_t * y = yy + 256*i + 32*ir + 4*il;

    const block_q4_1 * x = (const block_q4_1 *)vx + ib;
    const float2 d = __half22float2(x->dm);

    const uint8_t * q = x->qs + 4*il;

    for (int l = 0; l < 4; ++l) {
        y[l+ 0] = d.x * (q[l] & 0xF) + d.y;
        y[l+16] = d.x * (q[l] >>  4) + d.y;
    }
}

//================================== k-quants

template<typename dst_t>
static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {

    const int i   = blockIdx.x;
    const block_q2_K * x = (const block_q2_K *) vx;

    const int tid = threadIdx.x;
#if QK_K == 256
    const int n   = tid/32;
    const int l   = tid - 32*n;
    const int is  = 8*n + l/16;

    const uint8_t q = x[i].qs[32*n + l];
    dst_t * y = yy + i*QK_K + 128*n;

    float dall = __low2half(x[i].dm);
    float dmin = __high2half(x[i].dm);
    y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
    y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
    y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
    y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
#else
    const int is = tid/16;  // 0 or 1
    const int il = tid%16;  // 0...15
    const uint8_t q = x[i].qs[il] >> (2*is);
    dst_t * y = yy + i*QK_K + 16*is + il;
    float dall = __low2half(x[i].dm);
    float dmin = __high2half(x[i].dm);
    y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
    y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
#endif

}

template<typename dst_t>
static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {

    const int i = blockIdx.x;
    const block_q3_K * x = (const block_q3_K *) vx;

#if QK_K == 256
    const int r = threadIdx.x/4;
    const int tid = r/2;
    const int is0 = r%2;
    const int l0 = 16*is0 + 4*(threadIdx.x%4);
    const int n = tid / 4;
    const int j = tid - 4*n;

    uint8_t m = 1 << (4*n + j);
    int is = 8*n + 2*j + is0;
    int shift = 2*j;

    int8_t us = is <  4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
                is <  8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) :
                is < 12 ? (x[i].scales[is-8] >>  4) | (((x[i].scales[is+0] >> 4) & 3) << 4) :
                          (x[i].scales[is-8] >>  4) | (((x[i].scales[is-4] >> 6) & 3) << 4);
    float d_all = x[i].d;
    float dl = d_all * (us - 32);

    dst_t * y = yy + i*QK_K + 128*n + 32*j;
    const uint8_t * q = x[i].qs + 32*n;
    const uint8_t * hm = x[i].hmask;

    for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
#else
    const int tid = threadIdx.x;
    const int is  = tid/16;  // 0 or 1
    const int il  = tid%16;  // 0...15
    const int im  = il/8;    // 0...1
    const int in  = il%8;    // 0...7

    dst_t * y = yy + i*QK_K + 16*is + il;

    const uint8_t q = x[i].qs[il] >> (2*is);
    const uint8_t h = x[i].hmask[in] >> (2*is + im);
    const float   d = (float)x[i].d;

    if (is == 0) {
        y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
        y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
    } else {
        y[ 0] = d * ((x[i].scales[0] >>  4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
        y[32] = d * ((x[i].scales[1] >>  4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
    }
#endif

}

#if QK_K == 256
static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
    if (j < 4) {
        d = q[j] & 63; m = q[j + 4] & 63;
    } else {
        d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
        m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
    }
}
#endif

template<typename dst_t>
static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
    const block_q4_K * x = (const block_q4_K *) vx;

    const int i = blockIdx.x;

#if QK_K == 256
    // assume 32 threads
    const int tid = threadIdx.x;
    const int il  = tid/8;
    const int ir  = tid%8;
    const int is  = 2*il;
    const int n   = 4;

    dst_t * y = yy + i*QK_K + 64*il + n*ir;

    const float dall = __low2half(x[i].dm);
    const float dmin = __high2half(x[i].dm);

    const uint8_t * q = x[i].qs + 32*il + n*ir;

    uint8_t sc, m;
    get_scale_min_k4(is + 0, x[i].scales, sc, m);
    const float d1 = dall * sc; const float m1 = dmin * m;
    get_scale_min_k4(is + 1, x[i].scales, sc, m);
    const float d2 = dall * sc; const float m2 = dmin * m;
    for (int l = 0; l < n; ++l) {
        y[l + 0] = d1 * (q[l] & 0xF) - m1;
        y[l +32] = d2 * (q[l] >>  4) - m2;
    }
#else
    const int tid = threadIdx.x;
    const uint8_t * q = x[i].qs;
    dst_t * y = yy + i*QK_K;
    const float d = (float)x[i].dm[0];
    const float m = (float)x[i].dm[1];
    y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
    y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >>  4) - m * (x[i].scales[1] >> 4);
#endif
}

template<typename dst_t>
static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
    const block_q5_K * x = (const block_q5_K *) vx;

    const int i = blockIdx.x;

#if QK_K == 256
    // assume 64 threads - this is very slightly better than the one below
    const int tid = threadIdx.x;
    const int il  = tid/16;   // il is in 0...3
    const int ir  = tid%16;   // ir is in 0...15
    const int is  = 2*il;     // is is in 0...6

    dst_t * y = yy + i*QK_K + 64*il + 2*ir;

    const float dall = __low2half(x[i].dm);
    const float dmin = __high2half(x[i].dm);

    const uint8_t * ql = x[i].qs + 32*il + 2*ir;
    const uint8_t * qh = x[i].qh + 2*ir;

    uint8_t sc, m;
    get_scale_min_k4(is + 0, x[i].scales, sc, m);
    const float d1 = dall * sc; const float m1 = dmin * m;
    get_scale_min_k4(is + 1, x[i].scales, sc, m);
    const float d2 = dall * sc; const float m2 = dmin * m;

    uint8_t   hm  = 1 << (2*il);
    y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1;
    y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1;
    hm <<= 1;
    y[32] = d2 * ((ql[ 0] >>  4) + (qh[ 0] & hm ? 16 : 0)) - m2;
    y[33] = d2 * ((ql[ 1] >>  4) + (qh[ 1] & hm ? 16 : 0)) - m2;
#else
    const int tid = threadIdx.x;
    const uint8_t q = x[i].qs[tid];
    const int im = tid/8;  // 0...3
    const int in = tid%8;  // 0...7
    const int is = tid/16; // 0 or 1
    const uint8_t h = x[i].qh[in] >> im;
    const float d = x[i].d;
    dst_t * y = yy + i*QK_K + tid;
    y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
    y[32] = d * x[i].scales[is+2] * ((q >>  4) - ((h >> 4) & 1 ? 0 : 16));
#endif
}

template<typename dst_t>
static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
    const block_q6_K * x = (const block_q6_K *) vx;

    const int i = blockIdx.x;
#if QK_K == 256

    // assume 64 threads - this is very slightly better than the one below
    const int tid = threadIdx.x;
    const int ip  = tid/32;   // ip is 0 or 1
    const int il  = tid - 32*ip; // 0...32
    const int is  = 8*ip + il/16;

    dst_t * y = yy + i*QK_K + 128*ip + il;

    const float d = x[i].d;

    const uint8_t * ql = x[i].ql + 64*ip + il;
    const uint8_t   qh = x[i].qh[32*ip + il];
    const int8_t  * sc = x[i].scales + is;

    y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
    y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
    y[64] = d * sc[4] * ((int8_t)((ql[ 0]  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
    y[96] = d * sc[6] * ((int8_t)((ql[32]  >> 4) | (((qh >> 6) & 3) << 4)) - 32);
#else

    // assume 32 threads
    const int tid = threadIdx.x;
    const int ip  = tid/16;         // 0 or 1
    const int il  = tid - 16*ip;    // 0...15

    dst_t * y = yy + i*QK_K + 16*ip + il;

    const float d = x[i].d;

    const uint8_t   ql = x[i].ql[16*ip + il];
    const uint8_t   qh = x[i].qh[il] >> (2*ip);
    const int8_t  * sc = x[i].scales;

    y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
    y[32] = d * sc[ip+2] * ((int8_t)((ql  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
#endif
}

inline bool ggml_cuda_supports_mmq(enum ggml_type type) {
    switch (type) {
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
            return true;
        default:
            return false;
    }
}

template<typename dst_t>
static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {

    const int i   = blockIdx.x;
    const block_iq2_xxs * x = (const block_iq2_xxs  *) vx;

    const int tid = threadIdx.x;
#if QK_K == 256
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
    const uint16_t * q2 = x[i].qs + 4*ib;
    const uint8_t  * aux8 = (const uint8_t *)q2;
    const uint8_t  * grid = (const uint8_t *)(iq2xxs_grid + aux8[il]);
    const uint32_t aux32 = q2[2] | (q2[3] << 16);
    const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
    const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
#else
    assert(false);
#endif

}

template<typename dst_t>
static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {

    const int i   = blockIdx.x;
    const block_iq2_xs * x = (const block_iq2_xs *) vx;

    const int tid = threadIdx.x;
#if QK_K == 256
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
    const uint16_t * q2 = x[i].qs + 4*ib;
    const uint8_t  * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511));
    const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
    const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
#else
    assert(false);
#endif

}

template<typename dst_t>
static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {

    const int i   = blockIdx.x;
    const block_iq2_s * x = (const block_iq2_s *) vx;

    const int tid = threadIdx.x;
#if QK_K == 256
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
    const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
    const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
    const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
#else
    assert(false);
#endif

}

template<typename dst_t>
static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {

    const int i   = blockIdx.x;
    const block_iq3_xxs * x = (const block_iq3_xxs  *) vx;

    const int tid = threadIdx.x;
#if QK_K == 256
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
    const uint8_t  * q3 = x[i].qs + 8*ib;
    const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib;
    const uint8_t  * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*il+0]);
    const uint8_t  * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*il+1]);
    const uint32_t aux32 = gas[0] | (gas[1] << 16);
    const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.5f;
    const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
    for (int j = 0; j < 4; ++j) {
        y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
        y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
    }
#else
    assert(false);
#endif

}

template<typename dst_t>
static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {

    const int i   = blockIdx.x;
    const block_iq3_s * x = (const block_iq3_s *) vx;

    const int tid = threadIdx.x;
#if QK_K == 256
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
    const uint8_t * qs = x[i].qs + 8*ib;
    const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
    const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)));
    const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf));
    const uint8_t signs = x[i].signs[4*ib + il];
    for (int j = 0; j < 4; ++j) {
        y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
        y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
    }
#else
    assert(false);
#endif

}

template<typename dst_t>
static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {

    const int i   = blockIdx.x;
    const block_iq1_s * x = (const block_iq1_s  *) vx;

    const int tid = threadIdx.x;
#if QK_K == 256
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
    const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
    const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1);
    uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
    grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)];
    grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
    grid32[0] &= 0x0f0f0f0f;
    for (int j = 0; j < 8; ++j) {
        y[j] = d * (q[j] + delta);
    }
#else
    assert(false);
#endif

}

static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

template<typename dst_t>
static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {

    const int i   = blockIdx.x;
    const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);

    const int tid = threadIdx.x;
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
    const uint8_t  * q4 = x[ib].qs + 4*il;
    const float d = (float)x[ib].d;
    for (int j = 0; j < 4; ++j) {
        y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
        y[j+16] = d * kvalues_iq4nl[q4[j] >>  4];
    }

}

#if QK_K != 64
template<typename dst_t>
static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
    const int i   = blockIdx.x;
    const block_iq4_xs * x = (const block_iq4_xs *)vx;

    const int tid = threadIdx.x;
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
    const uint8_t  * q4 = x[i].qs + 16*ib + 4*il;
    const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
    for (int j = 0; j < 4; ++j) {
        y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
        y[j+16] = d * kvalues_iq4nl[q4[j] >>  4];
    }
}
#endif

static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");

    const int row = blockIdx.x*blockDim.y + threadIdx.y;
    if (row > nrows) return;

    const int num_blocks_per_row = ncols / QK_K;
    const int ib0 = row*num_blocks_per_row;

    const block_q2_K * x = (const block_q2_K *)vx + ib0;

    float tmp = 0; // partial sum for thread in warp

#if QK_K == 256
    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1

    const int step = 16/K_QUANTS_PER_ITERATION;

    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
    const int in = tid - step*im;                        // 0...15 or 0...7

    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15 or 0...14 in steps of 2
    const int q_offset = 32*im + l0;
    const int s_offset = 8*im;
    const int y_offset = 128*im + l0;

    uint32_t aux[4];
    const uint8_t * d = (const uint8_t *)aux;
    const uint8_t * m = (const uint8_t *)(aux + 2);

    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {

        const float   * y = yy + i * QK_K + y_offset;
        const uint8_t * q = x[i].qs + q_offset;

        const float dall = __low2half(x[i].dm);
        const float dmin = __high2half(x[i].dm);

        const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
        aux[0] = a[0] & 0x0f0f0f0f;
        aux[1] = a[1] & 0x0f0f0f0f;
        aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
        aux[3] = (a[1] >> 4) & 0x0f0f0f0f;

        float sum1 = 0, sum2 = 0;
        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
            sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
                  + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
                  + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
                  + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
                  + y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
                  + y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
                  + y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
                  +y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
            sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
                  + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];

        }
        tmp += dall * sum1 - dmin * sum2;

    }
#else
    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
    const int offset = tid * K_QUANTS_PER_ITERATION;

    uint32_t uaux[2];
    const uint8_t * d = (const uint8_t *)uaux;

    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {

        const float   * y = yy + i * QK_K + offset;
        const uint8_t * q = x[i].qs + offset;
        const uint32_t * s = (const uint32_t *)x[i].scales;

        uaux[0] = s[0] & 0x0f0f0f0f;
        uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;

        const float2 dall = __half22float2(x[i].dm);

        float sum1 = 0, sum2 = 0;
        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
            const uint8_t ql = q[l];
            sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
                  + y[l+16] * d[1] * ((ql >> 2) & 3)
                  + y[l+32] * d[2] * ((ql >> 4) & 3)
                  + y[l+48] * d[3] * ((ql >> 6) & 3);
            sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
        }
        tmp += dall.x * sum1 - dall.y * sum2;
    }
#endif

    // sum up partial sums and write back result
    tmp = warp_reduce_sum(tmp);

    if (threadIdx.x == 0) {
        dst[row] = tmp;
    }
}

static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

    const int row = blockIdx.x*blockDim.y + threadIdx.y;
    if (row > nrows) return;

    const int num_blocks_per_row = ncols / QK_K;
    const int ib0 = row*num_blocks_per_row;

    const block_q3_K * x = (const block_q3_K *)vx + ib0;

    float tmp = 0; // partial sum for thread in warp

#if QK_K == 256

    const uint16_t kmask1 = 0x0303;
    const uint16_t kmask2 = 0x0f0f;

    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1

    const int n  = K_QUANTS_PER_ITERATION;               // iterations in the inner loop
    const int step = 16/K_QUANTS_PER_ITERATION;
    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
    const int in = tid - step*im;                        // 0....15 or 0...7

    const uint8_t m = 1 << (4*im);

    const int l0 = n*in;                                 // 0...15 or 0...14 in steps of 2
    const int q_offset =  32*im + l0;
    const int y_offset = 128*im + l0;

    uint16_t utmp[4];
    const int8_t * s = (const int8_t *)utmp;

    const uint16_t s_shift = 4*im;

    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {

        const float   * y  = yy + i * QK_K + y_offset;
        const uint8_t * q = x[i].qs + q_offset;
        const uint8_t * h = x[i].hmask + l0;

        const uint16_t * a = (const uint16_t *)x[i].scales;
        utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
        utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
        utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
        utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);

        const float d = x[i].d;

        float sum = 0;
        for (int l = 0; l < n; ++l) {
            sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
                 + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
                 + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
                 + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
            sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
                 + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
                 + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
                + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
        }
        tmp += d * sum;

    }
#else

    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
    const int offset = tid * K_QUANTS_PER_ITERATION;         // 0...15 or 0...14
    const int in = offset/8;                                 // 0 or 1
    const int im = offset%8;                                 // 0...7

    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {

        const float   * y = yy + i * QK_K + offset;
        const uint8_t * q = x[i].qs + offset;
        const uint8_t * s = x[i].scales;

        const float dall = (float)x[i].d;

        float sum = 0;
        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
            const uint8_t hl = x[i].hmask[im+l] >> in;
            const uint8_t ql = q[l];
            sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
                 + y[l+16] * dall * ((s[0] >>  4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
                 + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
                 + y[l+48] * dall * ((s[1] >>  4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
        }
        tmp += sum;
    }
#endif

    // sum up partial sums and write back result
    tmp = warp_reduce_sum(tmp);

    if (threadIdx.x == 0) {
        dst[row] = tmp;
    }
}

static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

    const int row = blockIdx.x*blockDim.y + threadIdx.y;
    if (row > nrows) return;
    const int num_blocks_per_row = ncols / QK_K;
    const int ib0 = row*num_blocks_per_row;

    const block_q4_K * x = (const block_q4_K *)vx + ib0;

#if QK_K == 256
    const uint16_t kmask1 = 0x3f3f;
    const uint16_t kmask2 = 0x0f0f;
    const uint16_t kmask3 = 0xc0c0;

    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1

    const int step = 8/K_QUANTS_PER_ITERATION;           // 8 or 4

    const int il  = tid/step;                            // 0...3
    const int ir  = tid - step*il;                       // 0...7 or 0...3
    const int n   = 2 * K_QUANTS_PER_ITERATION;          // 2 or 4

    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
    const int in = il%2;

    const int l0 = n*(2*ir + in);
    const int q_offset = 32*im + l0;
    const int y_offset = 64*im + l0;

    uint16_t aux[4];
    const uint8_t * sc = (const uint8_t *)aux;

#if K_QUANTS_PER_ITERATION == 2
    uint32_t q32[4];
    const uint8_t * q4 = (const uint8_t *)q32;
#else
    uint16_t q16[4];
    const uint8_t * q4 = (const uint8_t *)q16;
#endif

    float tmp = 0; // partial sum for thread in warp

    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {

        const float   * y1 = yy + i*QK_K + y_offset;
        const float   * y2 = y1 + 128;

        const float dall = __low2half(x[i].dm);
        const float dmin = __high2half(x[i].dm);

        const uint16_t * a = (const uint16_t *)x[i].scales;
        aux[0] = a[im+0] & kmask1;
        aux[1] = a[im+2] & kmask1;
        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);

#if K_QUANTS_PER_ITERATION == 2
        const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
        const uint32_t * q2 = q1 + 16;

        q32[0] = q1[0] & 0x0f0f0f0f;
        q32[1] = q1[0] & 0xf0f0f0f0;
        q32[2] = q2[0] & 0x0f0f0f0f;
        q32[3] = q2[0] & 0xf0f0f0f0;

        float4 s = {0.f, 0.f, 0.f, 0.f};
        float smin = 0;
        for (int l = 0; l < 4; ++l) {
            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+ 4];
            s.z += y2[l] * q4[l+8]; s.w += y2[l+32] * q4[l+12];
            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
        }
        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
#else
        const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
        const uint16_t * q2 = q1 + 32;

        q16[0] = q1[0] & 0x0f0f;
        q16[1] = q1[0] & 0xf0f0;
        q16[2] = q2[0] & 0x0f0f;
        q16[3] = q2[0] & 0xf0f0;

        float4 s = {0.f, 0.f, 0.f, 0.f};
        float smin = 0;
        for (int l = 0; l < 2; ++l) {
            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
            s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
        }
        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
#endif

    }
#else
    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);

    const int step = tid * K_QUANTS_PER_ITERATION;

    uint16_t aux16[2];
    const uint8_t * s = (const uint8_t *)aux16;

    float tmp = 0;

    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
        const uint8_t * q = x[i].qs + step;
        const float   * y = yy + i*QK_K + step;
        const uint16_t * a = (const uint16_t *)x[i].scales;
        aux16[0] = a[0] & 0x0f0f;
        aux16[1] = (a[0] >> 4) & 0x0f0f;
        const float d = (float)x[i].dm[0];
        const float m = (float)x[i].dm[1];
        float sum = 0.f;
        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
            sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
                 + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
                 + y[j+32] * (d * s[1] * (q[j+ 0] >>  4) - m * s[3])
                 + y[j+48] * (d * s[1] * (q[j+16] >>  4) - m * s[3]);
        }
        tmp += sum;
    }

#endif

    // sum up partial sums and write back result
    tmp = warp_reduce_sum(tmp);

    if (tid == 0) {
        dst[row] = tmp;
    }
}

static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) {

    const int row = blockIdx.x;
    const int num_blocks_per_row = ncols / QK_K;
    const int ib0 = row*num_blocks_per_row;

    const block_q5_K * x = (const block_q5_K *)vx + ib0;

    float tmp = 0; // partial sum for thread in warp

#if QK_K == 256
    const uint16_t kmask1 = 0x3f3f;
    const uint16_t kmask2 = 0x0f0f;
    const uint16_t kmask3 = 0xc0c0;

    const int tid = threadIdx.x/2;  // 0...15
    const int ix  = threadIdx.x%2;

    const int il  = tid/4;     // 0...3
    const int ir  = tid - 4*il;// 0...3
    const int n   = 2;

    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
    const int in = il%2;

    const int l0 = n*(2*ir + in);
    const int q_offset = 32*im + l0;
    const int y_offset = 64*im + l0;

    const uint8_t hm1  = 1 << (2*im);
    const uint8_t hm2  = hm1 << 4;

    uint16_t aux[4];
    const uint8_t * sc = (const uint8_t *)aux;

    uint16_t q16[8];
    const uint8_t * q4 = (const uint8_t *)q16;

    for (int i = ix; i < num_blocks_per_row; i += 2) {

        const uint8_t * ql1 = x[i].qs + q_offset;
        const uint8_t * qh  = x[i].qh + l0;
        const float   * y1  = yy + i*QK_K + y_offset;
        const float   * y2  = y1 + 128;

        const float dall = __low2half(x[i].dm);
        const float dmin = __high2half(x[i].dm);

        const uint16_t * a = (const uint16_t *)x[i].scales;
        aux[0] = a[im+0] & kmask1;
        aux[1] = a[im+2] & kmask1;
        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);

        float4 sum = {0.f, 0.f, 0.f, 0.f};
        float smin = 0;
        const uint16_t * q1 = (const uint16_t *)ql1;
        const uint16_t * q2 = q1 + 32;
        q16[0] = q1[0] & 0x0f0f;
        q16[1] = q1[8] & 0x0f0f;
        q16[2] = (q1[0] >> 4) & 0x0f0f;
        q16[3] = (q1[8] >> 4) & 0x0f0f;
        q16[4] = q2[0] & 0x0f0f;
        q16[5] = q2[8] & 0x0f0f;
        q16[6] = (q2[0] >> 4) & 0x0f0f;
        q16[7] = (q2[8] >> 4) & 0x0f0f;
        for (int l = 0; l < n; ++l) {
            sum.x += y1[l+ 0] * (q4[l +0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
                   + y1[l+16] * (q4[l +2] + (qh[l+16] & (hm1 << 0) ? 16 : 0));
            sum.y += y1[l+32] * (q4[l +4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
                   + y1[l+48] * (q4[l +6] + (qh[l+16] & (hm1 << 1) ? 16 : 0));
            sum.z += y2[l+ 0] * (q4[l +8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
                   + y2[l+16] * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0));
            sum.w += y2[l+32] * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
                   + y2[l+48] * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0));
            smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
                  + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
        }
        tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
    }

#else
    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
    const int step = tid * K_QUANTS_PER_ITERATION;
    const int im = step/8;
    const int in = step%8;

    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
        const uint8_t * q = x[i].qs + step;
        const int8_t  * s = x[i].scales;
        const float   * y = yy + i*QK_K + step;
        const float     d = x[i].d;
        float sum = 0.f;
        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
            const uint8_t h = x[i].qh[in+j] >> im;
            sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
                 + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
                 + y[j+32] * d * s[2] * ((q[j+ 0] >>  4) - ((h >> 4) & 1 ? 0 : 16))
                 + y[j+48] * d * s[3] * ((q[j+16] >>  4) - ((h >> 6) & 1 ? 0 : 16));
        }
        tmp += sum;
    }
#endif

    // sum up partial sums and write back result
    tmp = warp_reduce_sum(tmp);

    if (threadIdx.x == 0) {
        dst[row] = tmp;
    }
}

static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");

    const int row = blockIdx.x*blockDim.y + threadIdx.y;
    if (row > nrows) return;

    const int num_blocks_per_row = ncols / QK_K;
    const int ib0 = row*num_blocks_per_row;

    const block_q6_K * x = (const block_q6_K *)vx + ib0;

#if QK_K == 256

    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0, 1

    const int step = 16/K_QUANTS_PER_ITERATION;          // 16 or 8

    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
    const int in = tid - step*im;                        // 0...15 or 0...7

#if K_QUANTS_PER_ITERATION == 1
    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15
    const int is = 0;
#else
    const int l0 = 4 * in;                               // 0, 4, 8, ..., 28
    const int is = in / 4;
#endif
    const int ql_offset = 64*im + l0;
    const int qh_offset = 32*im + l0;
    const int s_offset  =  8*im + is;
    const int y_offset = 128*im + l0;

    float tmp = 0; // partial sum for thread in warp

    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {

        const float   * y  = yy + i * QK_K + y_offset;
        const uint8_t * ql = x[i].ql + ql_offset;
        const uint8_t * qh = x[i].qh + qh_offset;
        const int8_t  * s  = x[i].scales + s_offset;

        const float d = x[i].d;

#if K_QUANTS_PER_ITERATION == 1
        float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
                  + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
                  + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
                  + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
                  + y[64] * s[4] * d * ((int8_t)((ql[ 0]  >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
                  + y[80] * s[5] * d * ((int8_t)((ql[16]  >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
                  + y[96] * s[6] * d * ((int8_t)((ql[32]  >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
                  +y[112] * s[7] * d * ((int8_t)((ql[48]  >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
        tmp += sum;
#else
        float sum = 0;
        for (int l = 0; l < 4; ++l) {
            sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
                 + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
                 + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
                 + y[l+96] * s[6] * d * ((int8_t)((ql[l+32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
        }
        tmp += sum;
#endif

    }

#else

    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...7
    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0...3

    const int step = tid * K_QUANTS_PER_ITERATION;

    float tmp = 0; // partial sum for thread in warp

    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {

        const float   * y  = yy + i * QK_K + step;
        const uint8_t * ql = x[i].ql + step;
        const uint8_t * qh = x[i].qh + step;
        const int8_t  * s  = x[i].scales;

        const float d = x[i+0].d;

        float sum = 0;
        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
            sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
                 + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
                 + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >>  4) | ((qh[j] & 0x30) >> 0)) - 32)
                 + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >>  4) | ((qh[j] & 0xc0) >> 2)) - 32);
        }
        tmp += sum;

    }

#endif

    // sum up partial sums and write back result
    tmp = warp_reduce_sum(tmp);

    if (tid == 0) {
        dst[row] = tmp;
    }
}

static __device__ void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){
    const half * x = (const half *) vx;

    // automatic half -> float type cast if dfloat == float
    v.x = x[ib + iqs + 0];
    v.y = x[ib + iqs + 1];
}

static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
    const int ix = blockDim.x*blockIdx.x + threadIdx.x;

    if (ix >= kx_padded) {
        return;
    }

    const int iy = blockDim.y*blockIdx.y + threadIdx.y;

    const int i_padded = iy*kx_padded + ix;

    block_q8_1 * y = (block_q8_1 *) vy;

    const int ib = i_padded / QK8_1; // block index
    const int iqs = i_padded % QK8_1; // quant index

    const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
    float amax = fabsf(xi);
    float sum = xi;

    amax = warp_reduce_max(amax);
    sum = warp_reduce_sum(sum);

    const float d = amax / 127;
    const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);

    y[ib].qs[iqs] = q;

    if (iqs > 0) {
        return;
    }

    reinterpret_cast<half&>(y[ib].ds.x) = d;
    reinterpret_cast<half&>(y[ib].ds.y) = sum;
}

template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
static __global__ void k_get_rows(
            const void * src0, const int32_t * src1, dst_t * dst,
            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
            size_t s10, size_t s11, size_t s12/*, size_t s13*/) {

    const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2;
    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;

    if (i00 >= ne00) {
        return;
    }

    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];

    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
    const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;

    const int ib = i00/qk; // block index
    const int iqs = (i00%qk)/qr; // quant index
    const int iybs = i00 - i00%qk; // dst block start index
    const int y_offset = qr == 1 ? 1 : qk/2;

    // dequantize
    dfloat2 v;
    dequantize_kernel(src0_row, ib, iqs, v);

    dst_row[iybs + iqs + 0]        = v.x;
    dst_row[iybs + iqs + y_offset] = v.y;
}

template<typename src0_t, typename dst_t>
static __global__ void k_get_rows_float(
            const src0_t * src0, const int32_t * src1, dst_t * dst,
            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
            size_t s10, size_t s11, size_t s12/*, size_t s13*/) {

    const int i00 = blockIdx.x*blockDim.x + threadIdx.x;
    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;

    if (i00 >= ne00) {
        return;
    }

    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];

    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
    const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);

    dst_row[i00] = src0_row[i00];
}

template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
    const int i = 2*(blockDim.x*blockIdx.x + threadIdx.x);

    if (i >= k) {
        return;
    }

    const int ib = i/qk; // block index
    const int iqs = (i%qk)/qr; // quant index
    const int iybs = i - i%qk; // y block start index
    const int y_offset = qr == 1 ? 1 : qk/2;

    // dequantize
    dfloat2 v;
    dequantize_kernel(vx, ib, iqs, v);

    y[iybs + iqs + 0]        = v.x;
    y[iybs + iqs + y_offset] = v.y;
}

template <typename src_t, typename dst_t>
static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= k) {
        return;
    }

    const src_t * x = (src_t *) vx;

    y[i] = x[i];
}

template <bool need_check>
static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int k) {
#if __CUDA_ARCH__ >= CC_PASCAL
    constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;

    const int   i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;
    const int * x0 = ((int *) vx) + blockIdx.x * nint;
    half2 * y2 = (half2 *) (y + i0);

    __shared__ int vals[nint];

#pragma unroll
    for (int ix0 = 0; ix0 < nint; ix0 += WARP_SIZE) {
        if (need_check && i0*sizeof(block_q8_0)/QK8_0 + sizeof(int)*(ix0 + threadIdx.x) >= k*sizeof(block_q8_0)/QK8_0) {
            break;
        }

        const int ix = ix0 + threadIdx.x;
        vals[ix] = x0[ix];
    }

#pragma unroll
    for (int iy = 0; iy < CUDA_Q8_0_NE_ALIGN; iy += 2*WARP_SIZE) {
        if (need_check && i0 + iy + 2*threadIdx.x >= k) {
            return;
        }

        const half * b0 = ((const half  *) vals) + (sizeof(block_q8_0)/sizeof(half)) * ((iy + 2*threadIdx.x)/QK8_0);
        const half    d = *b0;
        const char2  qs = ((const char2 *) (b0 + 1))[threadIdx.x % (QK8_0/2)];

        y2[iy/2 + threadIdx.x] = __hmul2(make_half2(qs.x, qs.y), __half2half2(d));
    }
#else
    GGML_UNUSED(vx);
    GGML_UNUSED(y);
    GGML_UNUSED(k);
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_PASCAL
}

// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q

#define VDR_Q4_0_Q8_1_MMVQ 2
#define VDR_Q4_0_Q8_1_MMQ  4

template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(
    const int * v, const int * u, const float & d4, const half2 & ds8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    int sumi = 0;

#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;

        // SIMD dot product of quantized values
        sumi = __dp4a(vi0, u[2*i+0], sumi);
        sumi = __dp4a(vi1, u[2*i+1], sumi);
    }

    const float2 ds8f = __half22float2(ds8);

    // second part effectively subtracts 8 from each quant value
    return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

#define VDR_Q4_1_Q8_1_MMVQ 2
#define VDR_Q4_1_Q8_1_MMQ  4

template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(
    const int * v, const int * u, const half2 & dm4, const half2 & ds8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    int sumi = 0;

#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;

        // SIMD dot product of quantized values
        sumi = __dp4a(vi0, u[2*i+0], sumi);
        sumi = __dp4a(vi1, u[2*i+1], sumi);
    }

#ifdef GGML_CUDA_F16
    const float2 tmp = __half22float2(__hmul2(dm4, ds8));
    const float d4d8 = tmp.x;
    const float m4s8 = tmp.y;
#else
    const float2 dm4f = __half22float2(dm4);
    const float2 ds8f = __half22float2(ds8);
    const float d4d8 = dm4f.x * ds8f.x;
    const float m4s8 = dm4f.y * ds8f.y;
#endif // GGML_CUDA_F16

    // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
    return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

#define VDR_Q5_0_Q8_1_MMVQ 2
#define VDR_Q5_0_Q8_1_MMQ  4

template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
    const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    int sumi = 0;

#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
        sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values

        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
        sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
    }

    const float2 ds8f = __half22float2(ds8);

    // second part effectively subtracts 16 from each quant value
    return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

#define VDR_Q5_1_Q8_1_MMVQ 2
#define VDR_Q5_1_Q8_1_MMQ  4

template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(
    const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    int sumi = 0;

#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
        sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values

        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
        sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
    }

#ifdef GGML_CUDA_F16
    const float2 tmp = __half22float2(__hmul2(dm5, ds8));
    const float d5d8 = tmp.x;
    const float m5s8 = tmp.y;
#else
    const float2 dm5f = __half22float2(dm5);
    const float2 ds8f = __half22float2(ds8);
    const float d5d8 = dm5f.x * ds8f.x;
    const float m5s8 = dm5f.y * ds8f.y;
#endif // GGML_CUDA_F16

    // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
    return sumi*d5d8 + m5s8 / (QI5_1 / vdr);

#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

#define VDR_Q8_0_Q8_1_MMVQ 2
#define VDR_Q8_0_Q8_1_MMQ 8

template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl(
    const int * v, const int * u, const float & d8_0, const float & d8_1) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    int sumi = 0;

#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        // SIMD dot product of quantized values
        sumi = __dp4a(v[i], u[i], sumi);
    }

    return d8_0*d8_1 * sumi;
#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl(
    const int * v, const int * u, const half2 & dm8, const half2 & ds8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    int sumi = 0;

#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        // SIMD dot product of quantized values
        sumi = __dp4a(v[i], u[i], sumi);
    }

#ifdef GGML_CUDA_F16
    const float2 tmp = __half22float2(__hmul2(dm8, ds8));
    const float d8d8 = tmp.x;
    const float m8s8 = tmp.y;
#else
    const float2 dm8f = __half22float2(dm8);
    const float2 ds8f = __half22float2(ds8);
    const float d8d8 = dm8f.x * ds8f.x;
    const float m8s8 = dm8f.y * ds8f.y;
#endif // GGML_CUDA_F16

    // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
    return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

#define VDR_Q2_K_Q8_1_MMVQ 1
#define VDR_Q2_K_Q8_1_MMQ  2

// contiguous v/x values
static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
    const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
    const half2 & dm2, const float * __restrict__ d8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

#pragma unroll
    for (int i = 0; i < QR2_K; ++i) {
        const int sc = scales[2*i];

        const int vi = (v >> (2*i)) & 0x03030303;

        sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product

        // fill int with 4x m
        int m = sc >> 4;
        m |= m <<  8;
        m |= m << 16;
        sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
    }

    const float2 dm2f = __half22float2(dm2);

    return dm2f.x*sumf_d - dm2f.y*sumf_m;
#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

// contiguous u/y values
static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
    const half2 & dm2, const float & d8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    int sumi_d = 0;
    int sumi_m = 0;

#pragma unroll
    for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) {
        int sumi_d_sc = 0;

        const int sc = scales[i0 / (QI8_1/2)];

        // fill int with 4x m
        int m = sc >> 4;
        m |= m <<  8;
        m |= m << 16;

#pragma unroll
        for (int i = i0; i < i0 + QI8_1/2; ++i) {
            sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
            sumi_m    = __dp4a(m,    u[i], sumi_m); // multiply sum of q8_1 values with m
        }

        sumi_d += sumi_d_sc * (sc & 0xF);
    }

    const float2 dm2f = __half22float2(dm2);

    return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

#define VDR_Q3_K_Q8_1_MMVQ 1
#define VDR_Q3_K_Q8_1_MMQ  2

// contiguous v/x values
static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
    const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
    const int & scale_offset, const float & d3, const float * __restrict__ d8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    float sumf = 0.0f;

#pragma unroll
    for (int i = 0; i < QR3_K; ++i) {
        const int isc = scale_offset + 2*i;

        const int isc_low = isc % (QK_K/32);
        const int sc_shift_low = 4 * (isc / (QK_K/32));
        const int sc_low  = (scales[isc_low] >> sc_shift_low) & 0xF;

        const int isc_high = isc % (QK_K/64);
        const int sc_shift_high = 2 * (isc / (QK_K/64));
        const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;

        const int sc = (sc_low | sc_high) - 32;

        const int vil = (vl >> (2*i)) & 0x03030303;

        const int vih = ((vh >> i) << 2) & 0x04040404;

        const int vi = __vsubss4(vil, vih);

        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
    }

    return d3 * sumf;
#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

// contiguous u/y values
static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
    const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales,
    const float & d3, const float & d8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    int sumi = 0;

#pragma unroll
    for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) {
        int sumi_sc = 0;

        for (int i = i0; i < i0 + QI8_1/2; ++i) {
            sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product
        }

        sumi += sumi_sc * scales[i0 / (QI8_1/2)];
    }

    return d3*d8 * sumi;
#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

#define VDR_Q4_K_Q8_1_MMVQ 2
#define VDR_Q4_K_Q8_1_MMQ  8

// contiguous v/x values
static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
    const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

#pragma unroll
    for (int i = 0; i < QR4_K; ++i) {
        const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
        const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;

        const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
        const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u

        sumf_d += d8[i] * (dot1 * sc[i]);
        sumf_m += d8[i] * (dot2 * m[i]);  // multiply constant part of q4_K with sum of q8_1 values
    }

    const float2 dm4f = __half22float2(dm4);

    return dm4f.x*sumf_d - dm4f.y*sumf_m;

#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

// contiguous u/y values
static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

#pragma unroll
    for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
        int sumi_d = 0;

#pragma unroll
        for (int j = 0; j < QI8_1; ++j) {
            sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
        }

        const float2 ds8f = __half22float2(ds8[i]);

        sumf_d += ds8f.x * (sc[i] * sumi_d);
        sumf_m += ds8f.y *   m[i]; // sum of q8_1 block * q4_K min val
    }

    const float2 dm4f = __half22float2(dm4);

    return dm4f.x*sumf_d - dm4f.y*sumf_m;

#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

#define VDR_Q5_K_Q8_1_MMVQ 2
#define VDR_Q5_K_Q8_1_MMQ  8

// contiguous v/x values
static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
    const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
    const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

#pragma unroll
    for (int i = 0; i < QR5_K; ++i) {
        const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
        const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;

        const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
        const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;

        const int v0i = vl0i | vh0i;
        const int v1i = vl1i | vh1i;

        const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
        const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u

        sumf_d += d8[i] * (dot1 * sc[i]);
        sumf_m += d8[i] * (dot2 * m[i]);

    }

    const float2 dm5f = __half22float2(dm5);

    return dm5f.x*sumf_d - dm5f.y*sumf_m;

#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

// contiguous u/y values
static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

#pragma unroll
    for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
        int sumi_d = 0;

#pragma unroll
        for (int j = 0; j < QI8_1; ++j) {
            sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
        }

        const float2 ds8f = __half22float2(ds8[i]);

        sumf_d += ds8f.x * (sc[i] * sumi_d);
        sumf_m += ds8f.y *   m[i]; // sum of q8_1 block * q4_K min val
    }

    const float2 dm4f = __half22float2(dm4);

    return dm4f.x*sumf_d - dm4f.y*sumf_m;

#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

#define VDR_Q6_K_Q8_1_MMVQ 1
#define VDR_Q6_K_Q8_1_MMQ  8

// contiguous v/x values
static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
    const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
    const float & d, const float * __restrict__ d8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    float sumf = 0.0f;

#pragma unroll
    for (int i = 0; i < QR6_K; ++i) {
        const int sc = scales[4*i];

        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;

        const int vih = ((vh >> (4*i)) << 4) & 0x30303030;

        const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32

        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
    }

    return d*sumf;
#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

// contiguous u/y values
static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
    const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc,
    const float & d6, const float * __restrict__ d8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    float sumf_d = 0.0f;

#pragma unroll
    for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) {
        int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale

#pragma unroll
        for (int i = i0; i < i0 + 2; ++i) {
            sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product
            sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product

            sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product
            sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product
        }

        sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y);
    }

    return d6 * sumf_d;

#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;

    int v[VDR_Q4_0_Q8_1_MMVQ];
    int u[2*VDR_Q4_0_Q8_1_MMVQ];

#pragma unroll
    for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
        v[i]     = get_int_from_uint8(bq4_0->qs, iqs + i);
        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
    }

    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
}

template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
    GGML_UNUSED(x_qh);
    GGML_UNUSED(x_sc);

    __shared__ int  tile_x_qs[mmq_y * (WARP_SIZE)       + mmq_y];
    __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];

    *x_ql = tile_x_qs;
    *x_dm = (half2 *) tile_x_d;
}

template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
    GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
    GGML_CUDA_ASSUME(i_offset >= 0);
    GGML_CUDA_ASSUME(i_offset <  nwarps);
    GGML_CUDA_ASSUME(k >= 0);
    GGML_CUDA_ASSUME(k <  WARP_SIZE);

    const int kbx  = k / QI4_0;
    const int kqsx = k % QI4_0;

    const block_q4_0 * bx0 = (const block_q4_0 *) vx;

    float * x_dmf = (float *) x_dm;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;

        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
        // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
    const int kbxd = k % blocks_per_tile_x_row;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
        int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;

        x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d;
    }
}

static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
    GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);

    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
    const float * x_dmf = (const float *) x_dm;

    int u[2*VDR_Q4_0_Q8_1_MMQ];

#pragma unroll
    for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
    }

    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
        (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
         y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
}

static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;

    int v[VDR_Q4_1_Q8_1_MMVQ];
    int u[2*VDR_Q4_1_Q8_1_MMVQ];

#pragma unroll
    for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
        v[i]    = get_int_from_uint8_aligned(bq4_1->qs, iqs + i);
        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
    }

    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
}

template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
    GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);

    __shared__ int   tile_x_qs[mmq_y * (WARP_SIZE) +     + mmq_y];
    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];

    *x_ql = tile_x_qs;
    *x_dm = tile_x_dm;
}

template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
    GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);

    GGML_CUDA_ASSUME(i_offset >= 0);
    GGML_CUDA_ASSUME(i_offset <  nwarps);
    GGML_CUDA_ASSUME(k >= 0);
    GGML_CUDA_ASSUME(k <  WARP_SIZE);

    const int kbx  = k / QI4_1;
    const int kqsx = k % QI4_1;

    const block_q4_1 * bx0 = (const block_q4_1 *) vx;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx;

        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI4_1;
    const int kbxd = k % blocks_per_tile_x_row;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
        int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd;

        x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm;
    }
}

static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
    GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);

    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));

    int u[2*VDR_Q4_1_Q8_1_MMQ];

#pragma unroll
    for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
    }

    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
        (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
         y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
}

static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

    const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;

    int vl[VDR_Q5_0_Q8_1_MMVQ];
    int vh[VDR_Q5_0_Q8_1_MMVQ];
    int  u[2*VDR_Q5_0_Q8_1_MMVQ];

#pragma unroll
    for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
        vl[i]    = get_int_from_uint8(bq5_0->qs, iqs + i);
        vh[i]    = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i));
        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0);
    }

    return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
}

template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
    GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);

    __shared__ int  tile_x_ql[mmq_y * (2*WARP_SIZE)     + mmq_y];
    __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];

    *x_ql = tile_x_ql;
    *x_dm = (half2 *) tile_x_d;
}

template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
    GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);

    GGML_CUDA_ASSUME(i_offset >= 0);
    GGML_CUDA_ASSUME(i_offset <  nwarps);
    GGML_CUDA_ASSUME(k >= 0);
    GGML_CUDA_ASSUME(k <  WARP_SIZE);

    const int kbx  = k / QI5_0;
    const int kqsx = k % QI5_0;

    const block_q5_0 * bx0 = (const block_q5_0 *) vx;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx;

        const int ql = get_int_from_uint8(bxi->qs, kqsx);
        const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0));

        int qs0 = (ql >>  0)   & 0x0F0F0F0F;
        qs0    |= (qh <<  4)   & 0x00000010;  // 0 ->  4
        qs0    |= (qh << 11)   & 0x00001000;  // 1 -> 12
        qs0    |= (qh << 18)   & 0x00100000;  // 2 -> 20
        qs0    |= (qh << 25)   & 0x10000000;  // 3 -> 28
        qs0     = __vsubss4(qs0, 0x10101010); // subtract 16

        x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;

        int qs1 = (ql >>  4)   & 0x0F0F0F0F;
        qs1    |= (qh >> 12)   & 0x00000010;  // 16 ->  4
        qs1    |= (qh >>  5)   & 0x00001000;  // 17 -> 12
        qs1    |= (qh <<  2)   & 0x00100000;  // 18 -> 20
        qs1    |= (qh <<  9)   & 0x10000000;  // 19 -> 28
        qs1     = __vsubss4(qs1, 0x10101010); // subtract 16

        x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI5_0;
    const int kbxd = k % blocks_per_tile_x_row;
    float * x_dmf = (float *) x_dm;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
        int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd;

        x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d;
    }
}

static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
    GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);

    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
    const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
    const float * x_dmf = (const float *) x_dm;
    const float * y_df  = (const float *) y_ds;

    int u[2*VDR_Q5_0_Q8_1_MMQ];

#pragma unroll
    for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
    }

    return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
}

static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;

    int vl[VDR_Q5_1_Q8_1_MMVQ];
    int vh[VDR_Q5_1_Q8_1_MMVQ];
    int  u[2*VDR_Q5_1_Q8_1_MMVQ];

#pragma unroll
    for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
        vl[i]   = get_int_from_uint8_aligned(bq5_1->qs, iqs + i);
        vh[i]   = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i));
        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1);
    }

    return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
}

template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
    GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);

    __shared__ int   tile_x_ql[mmq_y * (2*WARP_SIZE)     + mmq_y];
    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];

    *x_ql = tile_x_ql;
    *x_dm = tile_x_dm;
}

template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
    GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);

    GGML_CUDA_ASSUME(i_offset >= 0);
    GGML_CUDA_ASSUME(i_offset < nwarps);
    GGML_CUDA_ASSUME(k >= 0);
    GGML_CUDA_ASSUME(k <  WARP_SIZE);

    const int kbx  = k / QI5_1;
    const int kqsx = k % QI5_1;

    const block_q5_1 * bx0 = (const block_q5_1 *) vx;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx;

        const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
        const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1));

        int qs0 = (ql >>  0) & 0x0F0F0F0F;
        qs0    |= (qh <<  4) & 0x00000010; // 0 ->  4
        qs0    |= (qh << 11) & 0x00001000; // 1 -> 12
        qs0    |= (qh << 18) & 0x00100000; // 2 -> 20
        qs0    |= (qh << 25) & 0x10000000; // 3 -> 28

        x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;

        int qs1 = (ql >>  4) & 0x0F0F0F0F;
        qs1    |= (qh >> 12) & 0x00000010; // 16 ->  4
        qs1    |= (qh >>  5) & 0x00001000; // 17 -> 12
        qs1    |= (qh <<  2) & 0x00100000; // 18 -> 20
        qs1    |= (qh <<  9) & 0x10000000; // 19 -> 28

        x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI5_1;
    const int kbxd = k % blocks_per_tile_x_row;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
        int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd;

        x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm;
    }
}

static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
    GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);

    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
    const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;

    int u[2*VDR_Q5_1_Q8_1_MMQ];

#pragma unroll
    for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
    }

    return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
}

static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;

    int v[VDR_Q8_0_Q8_1_MMVQ];
    int u[VDR_Q8_0_Q8_1_MMVQ];

#pragma unroll
    for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
        v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
        u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
    }

    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
}

template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
    GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);

    __shared__ int  tile_x_qs[mmq_y * (WARP_SIZE)       + mmq_y];
    __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];

    *x_ql = tile_x_qs;
    *x_dm = (half2 *) tile_x_d;
}

template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
    GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);

    GGML_CUDA_ASSUME(i_offset >= 0);
    GGML_CUDA_ASSUME(i_offset <  nwarps);
    GGML_CUDA_ASSUME(k >= 0);
    GGML_CUDA_ASSUME(k <  WARP_SIZE);

    const int kbx  = k / QI8_0;
    const int kqsx = k % QI8_0;
    float * x_dmf = (float *) x_dm;

    const block_q8_0 * bx0 = (const block_q8_0 *) vx;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;

        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
    const int kbxd = k % blocks_per_tile_x_row;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
        int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;

        x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d;
    }
}

static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
    GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);

    const float * x_dmf = (const float *) x_dm;
    const float * y_df  = (const float *) y_ds;

    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
        (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
         y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
}

static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

    const block_q2_K * bq2_K = (const block_q2_K *) vbq;

    const int bq8_offset = QR2_K * (iqs / QI8_1);
    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);

    const uint8_t * scales = bq2_K->scales + scale_offset;

    const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
    int    u[QR2_K];
    float d8[QR2_K];

#pragma unroll
    for (int i = 0; i < QR2_K; ++ i) {
        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
        d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
    }

    return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
}

template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
    GGML_UNUSED(x_qh);

    __shared__ int   tile_x_ql[mmq_y * (WARP_SIZE)       + mmq_y];
    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/4)     + mmq_y/4];

    *x_ql = tile_x_ql;
    *x_dm = tile_x_dm;
    *x_sc = tile_x_sc;
}

template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
    GGML_UNUSED(x_qh);

    GGML_CUDA_ASSUME(i_offset >= 0);
    GGML_CUDA_ASSUME(i_offset <  nwarps);
    GGML_CUDA_ASSUME(k >= 0);
    GGML_CUDA_ASSUME(k <  WARP_SIZE);

    const int kbx  = k / QI2_K;
    const int kqsx = k % QI2_K;

    const block_q2_K * bx0 = (const block_q2_K *) vx;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx;

        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI2_K;
    const int kbxd = k % blocks_per_tile_x_row;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
        int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd;

        x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm;
    }

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
        int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4);

        x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4));
    }
}

static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
    GGML_UNUSED(x_qh);

    const int kbx = k / QI2_K;
    const int ky  = (k % QI2_K) * QR2_K;
    const float * y_df = (const float *) y_ds;

    int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];

    const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
    const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));

#pragma unroll
    for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) {
        v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
    }

    const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;

    const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE;
    return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
}

static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

    const block_q3_K * bq3_K = (const block_q3_K *) vbq;

    const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);

    const float d = bq3_K->d;

    const int vl = get_int_from_uint8(bq3_K->qs, iqs);

    // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
    const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;

    int    u[QR3_K];
    float d8[QR3_K];

#pragma unroll
    for (int i = 0; i < QR3_K; ++i) {
        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
        d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
    }

    return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
}

template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

    __shared__ int   tile_x_ql[mmq_y * (WARP_SIZE)       + mmq_y];
    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K];
    __shared__ int   tile_x_qh[mmq_y * (WARP_SIZE/2)     + mmq_y/2];
    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/4)     + mmq_y/4];

    *x_ql = tile_x_ql;
    *x_dm = tile_x_dm;
    *x_qh = tile_x_qh;
    *x_sc = tile_x_sc;
}

template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

    GGML_CUDA_ASSUME(i_offset >= 0);
    GGML_CUDA_ASSUME(i_offset <  nwarps);
    GGML_CUDA_ASSUME(k >= 0);
    GGML_CUDA_ASSUME(k <  WARP_SIZE);

    const int kbx  = k / QI3_K;
    const int kqsx = k % QI3_K;

    const block_q3_K * bx0 = (const block_q3_K *) vx;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx;

        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
    const int kbxd = k % blocks_per_tile_x_row;
    float * x_dmf = (float *) x_dm;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
        int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;

        x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d;
    }

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
        int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);

        // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
        x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
    }

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
        int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);

        const int ksc = k % (QI3_K/4);

        const int ksc_low = ksc % (QI3_K/8);
        const int shift_low = 4 * (ksc / (QI3_K/8));
        const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;

        const int ksc_high = QI3_K/8;
        const int shift_high = 2 * ksc;
        const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030;

        const int sc = __vsubss4(sc_low | sc_high, 0x20202020);

        x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc;
    }
}

static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

    const int kbx  = k / QI3_K;
    const int ky  = (k % QI3_K) * QR3_K;
    const float * x_dmf = (const float *) x_dm;
    const float * y_df  = (const float *) y_ds;

    const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;

    int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];

#pragma unroll
    for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
        const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
        const int shift = 2 * ((ky % 32) / 8);
        const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;

        const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
        const int vlh = (vh << 2) & 0x04040404;

        v[l] = __vsubss4(vll, vlh);
    }

    const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE;
    return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
}

static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

#ifndef GGML_QKK_64
    const block_q4_K * bq4_K = (const block_q4_K *) vbq;

    int    v[2];
    int    u[2*QR4_K];
    float d8[QR4_K];

    // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6
    const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2));

    // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
    // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
    // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
    // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108

    const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
    v[0] = q4[0];
    v[1] = q4[4];

    const uint16_t * scales = (const uint16_t *)bq4_K->scales;
    uint16_t aux[2];
    const int j = bq8_offset/2;
    if (j < 2) {
        aux[0] = scales[j+0] & 0x3f3f;
        aux[1] = scales[j+2] & 0x3f3f;
    } else {
        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
    }
    const uint8_t * sc = (const uint8_t *)aux;
    const uint8_t * m  = sc + 2;

    for (int i = 0; i < QR4_K; ++i) {
        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
        d8[i] = __low2float(bq8i->ds);

        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
        u[2*i+0] = q8[0];
        u[2*i+1] = q8[4];
    }

    return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);

#else

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    const block_q4_K * bq4_K = (const block_q4_K *) vbq;

    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

    uint16_t aux16[2];
    const uint8_t * s = (const uint8_t *)aux16;

    const uint16_t * a = (const uint16_t *)bq4_K->scales;
    aux16[0] = a[0] & 0x0f0f;
    aux16[1] = (a[0] >> 4) & 0x0f0f;

    const float dall = bq4_K->dm[0];
    const float dmin = bq4_K->dm[1];

    const float d8_1 = __low2float(bq8_1[0].ds);
    const float d8_2 = __low2float(bq8_1[1].ds);

    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);

    const int * q4 = (const int *)bq4_K->qs + (iqs/2);
    const int v1 = q4[0];
    const int v2 = q4[4];

    const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
    const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
    const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
    const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));

    sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
    sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);

    return dall * sumf_d - dmin * sumf_m;

#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A

#endif
}

template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
    GGML_UNUSED(x_qh);

    __shared__ int   tile_x_ql[mmq_y * (WARP_SIZE)       + mmq_y];
    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/8)     + mmq_y/8];

    *x_ql = tile_x_ql;
    *x_dm = tile_x_dm;
    *x_sc = tile_x_sc;
}

template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
    GGML_UNUSED(x_qh);

    GGML_CUDA_ASSUME(i_offset >= 0);
    GGML_CUDA_ASSUME(i_offset <  nwarps);
    GGML_CUDA_ASSUME(k >= 0);
    GGML_CUDA_ASSUME(k <  WARP_SIZE);

    const int kbx  = k / QI4_K; // == 0 if QK_K == 256
    const int kqsx = k % QI4_K; // == k if QK_K == 256

    const block_q4_K * bx0 = (const block_q4_K *) vx;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;

        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
        int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;

#if QK_K == 256
        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
#else
        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
#endif
    }

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);

        const int * scales = (const int *) bxi->scales;

        const int ksc = k % (WARP_SIZE/8);

        // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
        int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
        scales8    |= (scales[ksc/2]              >> (2 * (ksc % 2)))       & 0x30303030; // upper 2 bits

        x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
    }
}

static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
    GGML_UNUSED(x_qh);

    const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);

    const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
    return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
                                      x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
}

static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

#ifndef GGML_QKK_64
    const block_q5_K * bq5_K = (const block_q5_K *) vbq;

    int   vl[2];
    int   vh[2];
    int    u[2*QR5_K];
    float d8[QR5_K];

    const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2));
    const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
    const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4));

    vl[0] = ql[0];
    vl[1] = ql[4];

    vh[0] = qh[0] >> bq8_offset;
    vh[1] = qh[4] >> bq8_offset;

    const uint16_t * scales = (const uint16_t *)bq5_K->scales;
    uint16_t aux[2];
    const int j = bq8_offset/2;
    if (j < 2) {
        aux[0] = scales[j+0] & 0x3f3f;
        aux[1] = scales[j+2] & 0x3f3f;
    } else {
        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
    }
    const uint8_t * sc = (const uint8_t *)aux;
    const uint8_t * m  = sc + 2;

#pragma unroll
    for (int i = 0; i < QR5_K; ++i) {
        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
        d8[i] = __low2float(bq8i->ds);

        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
        u[2*i+0] = q8[0];
        u[2*i+1] = q8[4];
    }

    return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);

#else

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    const block_q5_K * bq5_K = (const block_q5_K *) vbq;

    const int8_t * s = bq5_K->scales;

    const float d = bq5_K->d;

    const float d8_1 = __low2half(bq8_1[0].ds);
    const float d8_2 = __low2half(bq8_1[1].ds);

    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);

    const int * ql = (const int *)bq5_K->qs + (iqs/2);
    const int vl1 = ql[0];
    const int vl2 = ql[4];

    const int step = 4 * (iqs/2); // 0, 4, 8, 12
    const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
    const int in = step%8; // 0, 4, 0, 4
    const int vh = (*((const int *)(bq5_K->qh + in))) >> im;

    const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
    const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
    const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
    const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);

    const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
                       + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);

    return d * sumf_d;

#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A

#endif
}

template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
    GGML_UNUSED(x_qh);

    __shared__ int   tile_x_ql[mmq_y * (2*WARP_SIZE)     + mmq_y];
    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/8)     + mmq_y/8];

    *x_ql = tile_x_ql;
    *x_dm = tile_x_dm;
    *x_sc = tile_x_sc;
}

template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
    GGML_UNUSED(x_qh);

    GGML_CUDA_ASSUME(i_offset >= 0);
    GGML_CUDA_ASSUME(i_offset <  nwarps);
    GGML_CUDA_ASSUME(k >= 0);
    GGML_CUDA_ASSUME(k <  WARP_SIZE);

    const int kbx  = k / QI5_K; // == 0 if QK_K == 256
    const int kqsx = k % QI5_K; // == k if QK_K == 256

    const block_q5_K * bx0 = (const block_q5_K *) vx;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
        const int ky = QR5_K*kqsx;

        const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
        const int ql0 = (ql >> 0) & 0x0F0F0F0F;
        const int ql1 = (ql >> 4) & 0x0F0F0F0F;

        const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4));
        const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010;
        const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010;

        const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0;
        const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4);

        x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0;
        x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
        int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;

#if QK_K == 256
        x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
#endif
    }

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);

        const int * scales = (const int *) bxi->scales;

        const int ksc = k % (WARP_SIZE/8);

        // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
        int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
        scales8    |= (scales[ksc/2]              >> (2 * (ksc % 2)))       & 0x30303030; // upper 2 bits

        x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
    }
}

static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
    GGML_UNUSED(x_qh);

    const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);

    const int index_x = i * (QR5_K*WARP_SIZE + 1) +  QR5_K*k;
    const int index_y = j * WARP_SIZE             + (QR5_K*k) % WARP_SIZE;
    return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
                                      x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
}

static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

    const block_q6_K * bq6_K = (const block_q6_K *) vbq;

    const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
    const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
    const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));

    const int vl = get_int_from_uint8(bq6_K->ql, iqs);
    const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;

    const int8_t * scales = bq6_K->scales + scale_offset;

    int    u[QR6_K];
    float d8[QR6_K];

#pragma unroll
    for (int i = 0; i < QR6_K; ++i) {
        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
        d8[i] = __low2float(bq8_1[bq8_offset + 2*i].ds);
    }

    return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
}

template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
    GGML_UNUSED(x_qh);

    __shared__ int   tile_x_ql[mmq_y * (2*WARP_SIZE)     + mmq_y];
    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/8)     + mmq_y/8];

    *x_ql = tile_x_ql;
    *x_dm = tile_x_dm;
    *x_sc = tile_x_sc;
}

template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
    GGML_UNUSED(x_qh);

    GGML_CUDA_ASSUME(i_offset >= 0);
    GGML_CUDA_ASSUME(i_offset <  nwarps);
    GGML_CUDA_ASSUME(k >= 0);
    GGML_CUDA_ASSUME(k <  WARP_SIZE);

    const int kbx  = k / QI6_K; // == 0 if QK_K == 256
    const int kqsx = k % QI6_K; // == k if QK_K == 256

    const block_q6_K * bx0 = (const block_q6_K *) vx;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
        const int ky = QR6_K*kqsx;

        const int ql = get_int_from_uint8(bxi->ql, kqsx);
        const int ql0 = (ql >> 0) & 0x0F0F0F0F;
        const int ql1 = (ql >> 4) & 0x0F0F0F0F;

        const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4));
        const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030;
        const int qh1 =  (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4))))       & 0x30303030;

        const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0;
        const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2);

        x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020);
        x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020);
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
    float * x_dmf = (float *) x_dm;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
        int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;

        x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d;
    }

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4;

        x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8));
    }
}

static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
    GGML_UNUSED(x_qh);

    const float * x_dmf = (const float *) x_dm;
    const float * y_df  = (const float *) y_ds;

    const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]);

    const int index_x = i * (QR6_K*WARP_SIZE + 1) +  QR6_K*k;
    const int index_y = j * WARP_SIZE             + (QR6_K*k) % WARP_SIZE;
    return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
}

static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
#if QK_K == 256
    const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;

#if QR2_XXS == 8
    const int ib32 = iqs;
    const uint16_t * q2 = bq2->qs + 4*ib32;
    const uint8_t  * aux8 = (const uint8_t *)q2;
    const int8_t   * q8 = bq8_1[ib32].qs;
    uint32_t aux32 = q2[2] | (q2[3] << 16);
    int sumi = 0;
    for (int l = 0; l < 4; ++l) {
        const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
        const uint8_t  signs = ksigns_iq2xs[aux32 & 127];
        for (int j = 0; j < 8; ++j) {
            sumi += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
        }
        q8 += 8;
        aux32 >>= 7;
    }
    const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.25f;
    return d * sumi;
#else
    // iqs is 0...15
    const int ib32 = iqs/2;
    const int il = iqs%2;
    const uint16_t * q2 = bq2->qs + 4*ib32;
    const uint8_t  * aux8 = (const uint8_t *)q2;
    const uint8_t  * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
    const uint8_t  * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
    const uint32_t aux32 = q2[2] | (q2[3] << 16);
    const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * __low2float(bq8_1[ib32].ds) * 0.25f;
    const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127];
    const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127];
    const int8_t * q8 = bq8_1[ib32].qs + 16*il;
    int sumi1 = 0, sumi2 = 0;
    for (int j = 0; j < 8; ++j) {
        sumi1 += q8[j+0] * grid1[j] * (signs1 & kmask_iq2xs[j] ? -1 : 1);
        sumi2 += q8[j+8] * grid2[j] * (signs2 & kmask_iq2xs[j] ? -1 : 1);
    }
    return d * (sumi1 + sumi2);
#endif
#else
    assert(false);
    return 0.f;
#endif
}

static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
#if QK_K == 256
    const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;

    const int ib32 = iqs;
    const uint16_t * q2 = bq2->qs + 4*ib32;
    const int8_t   * q8 = bq8_1[ib32].qs;
    const uint8_t ls1 = bq2->scales[ib32] & 0xf;
    const uint8_t ls2 = bq2->scales[ib32] >>  4;
    int sumi1 = 0;
    for (int l = 0; l < 2; ++l) {
        const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511));
        const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
        const int grid_l = __vsub4(grid[0] ^ signs[0], signs[0]);
        const int grid_h = __vsub4(grid[1] ^ signs[1], signs[1]);
        sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1);
        sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1);
        q8 += 8;
    }
    int sumi2 = 0;
    for (int l = 2; l < 4; ++l) {
        const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511));
        const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
        const int grid_l = __vsub4(grid[0] ^ signs[0], signs[0]);
        const int grid_h = __vsub4(grid[1] ^ signs[1], signs[1]);
        sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2);
        sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2);
        q8 += 8;
    }
    const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
    return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
#else
    GGML_UNUSED(ksigns64);
    assert(false);
    return 0.f;
#endif
#else
    GGML_UNUSED(ksigns64);
    assert(false);
    return 0.f;
#endif
}

// TODO
static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
#if QK_K == 256
    const block_iq2_s * bq2 = (const block_iq2_s *) vbq;

    const int ib32 = iqs;
    const int8_t  * q8 = bq8_1[ib32].qs;
    const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32;
    const uint8_t ls1 = bq2->scales[ib32] & 0xf;
    const uint8_t ls2 = bq2->scales[ib32] >>  4;
    int sumi1 = 0;
    for (int l = 0; l < 2; ++l) {
        const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
        const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
        const uint32_t signs1 = __vcmpeq4(((signs[l] >>  4) * 0x01010101) & 0x08040201, 0x08040201);
        const int grid_l = __vsub4(grid[0] ^ signs0, signs0);
        const int grid_h = __vsub4(grid[1] ^ signs1, signs1);
        sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1);
        sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1);
        q8 += 8;
    }
    int sumi2 = 0;
    for (int l = 2; l < 4; ++l) {
        const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
        const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
        const uint32_t signs1 = __vcmpeq4(((signs[l] >>  4) * 0x01010101) & 0x08040201, 0x08040201);
        const int grid_l = __vsub4(grid[0] ^ signs0, signs0);
        const int grid_h = __vsub4(grid[1] ^ signs1, signs1);
        sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2);
        sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2);
        q8 += 8;
    }
    const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
    return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
#else
    GGML_UNUSED(ksigns64);
    assert(false);
    return 0.f;
#endif
#else
    GGML_UNUSED(ksigns64);
    assert(false);
    return 0.f;
#endif
}

static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
#if QK_K == 256
    const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;

    const int ib32 = iqs;
    const uint8_t  * q3 = bq2->qs + 8*ib32;
    const uint16_t * gas = (const uint16_t *)(bq2->qs + QK_K/4) + 2*ib32;
    const int8_t   * q8 = bq8_1[ib32].qs;
    uint32_t aux32 = gas[0] | (gas[1] << 16);
    int sumi = 0;
    for (int l = 0; l < 4; ++l) {
        const uint32_t * grid1 = iq3xxs_grid + q3[2*l+0];
        const uint32_t * grid2 = iq3xxs_grid + q3[2*l+1];
        const uint32_t * signs = (const uint32_t *)(ksigns64 + (aux32 & 127));
        const int grid_l = __vsub4(grid1[0] ^ signs[0], signs[0]);
        const int grid_h = __vsub4(grid2[0] ^ signs[1], signs[1]);
        sumi = __dp4a(grid_l, *((int *)q8+0), sumi);
        sumi = __dp4a(grid_h, *((int *)q8+1), sumi);
        q8 += 8;
        aux32 >>= 7;
    }
    const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.5f;
    return d * sumi;
#else
    assert(false);
    return 0.f;
#endif
#else
    assert(false);
    return 0.f;
#endif
}

// TODO: don't use lookup table for signs
static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
#if QK_K == 256
    const block_iq3_s * bq2 = (const block_iq3_s *) vbq;

    const int ib32 = iqs;
    const uint8_t  * qs = bq2->qs + 8*ib32;
    const int8_t   * q8 = bq8_1[ib32].qs;
    int sumi = 0;
    for (int l = 0; l < 4; ++l) {
        const uint32_t * grid1 = iq3s_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256));
        const uint32_t * grid2 = iq3s_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256));
        uint32_t signs0 = __vcmpeq4(((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
        uint32_t signs1 = __vcmpeq4(((bq2->signs[4*ib32+l] >>  4) * 0x01010101) & 0x08040201, 0x08040201);
        const int grid_l = __vsub4(grid1[0] ^ signs0, signs0);
        const int grid_h = __vsub4(grid2[0] ^ signs1, signs1);
        sumi = __dp4a(grid_l, *((int *)q8+0), sumi);
        sumi = __dp4a(grid_h, *((int *)q8+1), sumi);
        q8 += 8;
    }
    const float d = (float)bq2->d * (1 + 2*((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * __low2float(bq8_1[ib32].ds);
    return d * sumi;
#else
    assert(false);
    return 0.f;
#endif
#else
    assert(false);
    return 0.f;
#endif
}

static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
#if QK_K == 256
    const block_iq1_s * bq1 = (const block_iq1_s *) vbq;

    const int ib32 = iqs;
    int sumi = 0;
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    const int * q8 = (const int *)bq8_1[ib32].qs;
    for (int l = 0; l < 4; ++l) {
        const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
        int grid0 = grid[0] & 0x0f0f0f0f;
        int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
        sumi = __dp4a(q8[2*l+1], grid1, __dp4a(q8[2*l+0], grid0, sumi));
    }
#else
    const int8_t * q8 = bq8_1[ib32].qs;
    for (int l = 0; l < 4; ++l) {
        const uint8_t * grid = (const uint8_t *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
        for (int j = 0; j < 4; ++j) {
            sumi += q8[j] * (grid[j] & 0xf) + q8[j+4] * (grid[j] >> 4);
        }
        q8 += 8;
    }
#endif
    const float delta = bq1->qh[ib32] & 0x8000 ? -1-IQ1S_DELTA : -1+IQ1S_DELTA;
    const float d1q = (float)bq1->d * (2*((bq1->qh[ib32] >> 12) & 7) + 1);
    const float d = d1q * __low2float (bq8_1[ib32].ds);
    const float m = d1q * __high2float(bq8_1[ib32].ds);
    return d * sumi + m * delta;
#else
    assert(false);
    return 0.f;
#endif
}

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
static __device__ __forceinline__ void get_int_from_table_16(const uint32_t & q4, const uint8_t * values,
        int & val1, int & val2) {

    uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32;
    aux32 = q4 & 0x0f0f0f0f;
    uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8);
    uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8);
    val1 = v1 | (v2 << 16);
    aux32 = (q4 >> 4) & 0x0f0f0f0f;
    v1 = values[q8[0]] | (values[q8[1]] << 8);
    v2 = values[q8[2]] | (values[q8[3]] << 8);
    val2 = v1 | (v2 << 16);
}
#endif

static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

    const block_iq4_nl * bq = (const block_iq4_nl *) vbq;

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs;
    const int32_t  * q8 = (const int32_t  *)bq8_1->qs + iqs;

    const uint8_t * values = (const uint8_t *)kvalues_iq4nl;

    int v1, v2;
    int sumi1 = 0, sumi2 = 0;
    for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
        const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16);
        get_int_from_table_16(aux, values, v1, v2);
        sumi1 = __dp4a(v1, q8[l+0], sumi1);
        sumi2 = __dp4a(v2, q8[l+4], sumi2);
    }

#else
    const uint8_t * q4 = bq->qs + 4*iqs;
    const int8_t  * q8 = bq8_1->qs + 4*iqs;

    int sumi1 = 0, sumi2 = 0;
    for (int l = 0; l < 4*VDR_Q4_0_Q8_1_MMVQ; ++l) {
        sumi1 += q8[l+ 0] * kvalues_iq4nl[q4[l] & 0xf];
        sumi2 += q8[l+16] * kvalues_iq4nl[q4[l] >>  4];
    }
#endif
    const float d = (float)bq->d * __low2float(bq8_1->ds);
    return d * (sumi1 + sumi2);
}

static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

#if QK_K == 256
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics

    const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
    const uint8_t * values = (const uint8_t *)kvalues_iq4nl;

    //// iqs is 0...7
    //const int ib64 = iqs/2;
    //const int il = iqs%2;
    //const int32_t  * q8_1 = (const int *)bq8_1[2*ib64+0].qs + 2*il;
    //const int32_t  * q8_2 = (const int *)bq8_1[2*ib64+1].qs + 2*il;
    //const uint32_t * q4_1 = (const uint32_t *)bq4->qs + 8*ib64 + 2*il;
    //const uint32_t * q4_2 = q4_1 + 4;
    //const int8_t ls1 = (bq4->scales_l[ib64] & 0xf) | (((bq4->scales_h >> (4*ib64+0)) & 3) << 4);
    //const int8_t ls2 = (bq4->scales_l[ib64] >>  4) | (((bq4->scales_h >> (4*ib64+2)) & 3) << 4);
    //const float d1 = (float)bq4->d * (ls1 - 32) * __low2float(bq8_1[2*ib64+0].ds);
    //const float d2 = (float)bq4->d * (ls2 - 32) * __low2float(bq8_1[2*ib64+1].ds);
    //int v1, v2;
    //int sumi1 = 0, sumi2 = 0;
    //for (int j = 0; j < 2; ++j) {
    //    get_int_from_table_16(q4_1[j], values, v1, v2);
    //    sumi1 = __dp4a(v2, q8_1[j+4], __dp4a(v1, q8_1[j+0], sumi1));
    //    get_int_from_table_16(q4_2[j], values, v1, v2);
    //    sumi2 = __dp4a(v2, q8_2[j+4], __dp4a(v1, q8_2[j+0], sumi2));
    //}
    //return d1 * sumi1 + d2 * sumi2;

    // iqs is 0...7
    const int ib32 = iqs;
    const int32_t  * q8 = (const int *)bq8_1[ib32].qs;
    const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32;
    const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4);
    const float d = (float)bq4->d * (ls - 32) * __low2float(bq8_1[ib32].ds);
    int v1, v2;
    int sumi1 = 0, sumi2 = 0;
    for (int j = 0; j < 4; ++j) {
        get_int_from_table_16(q4[j], values, v1, v2);
        sumi1 = __dp4a(v1, q8[j+0], sumi1);
        sumi2 = __dp4a(v2, q8[j+4], sumi2);
    }
    return d * (sumi1 + sumi2);

    //// iqs is 0...15
    //const int ib32 = iqs/2;
    //const int il = iqs%2;
    //const int32_t  * q8 = (const int *)bq8_1[ib32].qs + 2*il;
    //const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32 + 2*il;
    //const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4);
    //const float d = (float)bq4->d * (ls - 32) * __low2float(bq8_1[ib32].ds);
    //int v1, v2;
    //int sumi1 = 0, sumi2 = 0;
    //for (int j = 0; j < 2; ++j) {
    //    get_int_from_table_16(q4[j], values, v1, v2);
    //    sumi1 = __dp4a(v1, q8[j+0], sumi1);
    //    sumi2 = __dp4a(v2, q8[j+4], sumi2);
    //}
    //return d * (sumi1 + sumi2);
#else
    assert(false);
    return 0.f;
#endif
#else
    return vec_dot_iq4_xs_q8_1(vbq, bq8_1, iqs);
#endif
}

template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
              allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
static __device__ __forceinline__ void mul_mat_q(
    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

    const int blocks_per_row_x = ncols_x / qk;
    const int blocks_per_col_y = nrows_y / QK8_1;
    const int blocks_per_warp = WARP_SIZE / qi;

    const int & ncols_dst = ncols_y;

    const int row_dst_0 = blockIdx.x*mmq_y;
    const int & row_x_0 = row_dst_0;

    const int col_dst_0 = blockIdx.y*mmq_x;
    const int & col_y_0 = col_dst_0;

    int   * tile_x_ql = nullptr;
    half2 * tile_x_dm = nullptr;
    int   * tile_x_qh = nullptr;
    int   * tile_x_sc = nullptr;

    allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);

    __shared__ int    tile_y_qs[mmq_x * WARP_SIZE];
    __shared__ half2  tile_y_ds[mmq_x * WARP_SIZE/QI8_1];

    float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};

    for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {

        load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
                   threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x);

#pragma unroll
        for (int ir = 0; ir < qr; ++ir) {
            const int kqs = ir*WARP_SIZE + threadIdx.x;
            const int kbxd = kqs / QI8_1;

#pragma unroll
            for (int i = 0; i < mmq_x; i += nwarps) {
                const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses

                const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];

                const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE;
                tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
            }

#pragma unroll
            for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
                const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x;
                const int kby = threadIdx.x % (WARP_SIZE/QI8_1);
                const int col_y_eff = min(col_y_0 + ids, ncols_y-1);

                // if the sum is not needed it's faster to transform the scale to f32 ahead of time
                const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds;
                half2       * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby];
                if (need_sum) {
                    *dsi_dst = *dsi_src;
                } else {
                    float * dfi_dst = (float *) dsi_dst;
                    *dfi_dst = __low2float(*dsi_src);
                }
            }

            __syncthreads();

// #pragma unroll // unrolling this loop causes too much register pressure
            for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) {
#pragma unroll
                for (int j = 0; j < mmq_x; j += nwarps) {
#pragma unroll
                    for (int i = 0; i < mmq_y; i += WARP_SIZE) {
                        sum[i/WARP_SIZE][j/nwarps] += vec_dot(
                            tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
                            threadIdx.x + i, threadIdx.y + j, k);
                    }
                }
            }

            __syncthreads();
        }
    }

#pragma unroll
    for (int j = 0; j < mmq_x; j += nwarps) {
        const int col_dst = col_dst_0 + j + threadIdx.y;

        if (col_dst >= ncols_dst) {
            return;
        }

#pragma unroll
        for (int i = 0; i < mmq_y; i += WARP_SIZE) {
            const int row_dst = row_dst_0 + threadIdx.x + i;

            if (row_dst >= nrows_dst) {
                continue;
            }

            dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps];
        }
    }
}

#define  MMQ_X_Q4_0_RDNA2  64
#define  MMQ_Y_Q4_0_RDNA2  128
#define NWARPS_Q4_0_RDNA2  8
#define  MMQ_X_Q4_0_RDNA1  64
#define  MMQ_Y_Q4_0_RDNA1  64
#define NWARPS_Q4_0_RDNA1  8
#if defined(CUDA_USE_TENSOR_CORES)
#define  MMQ_X_Q4_0_AMPERE 4
#define  MMQ_Y_Q4_0_AMPERE 32
#define NWARPS_Q4_0_AMPERE 4
#else
#define  MMQ_X_Q4_0_AMPERE 64
#define  MMQ_Y_Q4_0_AMPERE 128
#define NWARPS_Q4_0_AMPERE 4
#endif
#define  MMQ_X_Q4_0_PASCAL 64
#define  MMQ_Y_Q4_0_PASCAL 64
#define NWARPS_Q4_0_PASCAL 8

template <bool need_check> static __global__ void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    __launch_bounds__(WARP_SIZE*NWARPS_Q4_0_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    mul_mat_q4_0(
    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    const int mmq_x  =  MMQ_X_Q4_0_RDNA2;
    const int mmq_y  =  MMQ_Y_Q4_0_RDNA2;
    const int nwarps = NWARPS_Q4_0_RDNA2;
#else
    const int mmq_x  =  MMQ_X_Q4_0_RDNA1;
    const int mmq_y  =  MMQ_Y_Q4_0_RDNA1;
    const int nwarps = NWARPS_Q4_0_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)

    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= CC_VOLTA
    const int mmq_x  =  MMQ_X_Q4_0_AMPERE;
    const int mmq_y  =  MMQ_Y_Q4_0_AMPERE;
    const int nwarps = NWARPS_Q4_0_AMPERE;

    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= MIN_CC_DP4A
    const int mmq_x  =  MMQ_X_Q4_0_PASCAL;
    const int mmq_y  =  MMQ_Y_Q4_0_PASCAL;
    const int nwarps = NWARPS_Q4_0_PASCAL;

    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
    GGML_UNUSED(vec_dot_q4_0_q8_1_mul_mat);
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}

#define  MMQ_X_Q4_1_RDNA2  64
#define  MMQ_Y_Q4_1_RDNA2  128
#define NWARPS_Q4_1_RDNA2  8
#define  MMQ_X_Q4_1_RDNA1  64
#define  MMQ_Y_Q4_1_RDNA1  64
#define NWARPS_Q4_1_RDNA1  8
#if defined(CUDA_USE_TENSOR_CORES)
#define  MMQ_X_Q4_1_AMPERE 4
#define  MMQ_Y_Q4_1_AMPERE 32
#define NWARPS_Q4_1_AMPERE 4
#else
#define  MMQ_X_Q4_1_AMPERE 64
#define  MMQ_Y_Q4_1_AMPERE 128
#define NWARPS_Q4_1_AMPERE 4
#endif
#define  MMQ_X_Q4_1_PASCAL 64
#define  MMQ_Y_Q4_1_PASCAL 64
#define NWARPS_Q4_1_PASCAL 8

template <bool need_check> static __global__ void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#elif __CUDA_ARCH__ < CC_VOLTA
    __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
#endif // __CUDA_ARCH__ < CC_VOLTA
    mul_mat_q4_1(
    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    const int mmq_x  =  MMQ_X_Q4_1_RDNA2;
    const int mmq_y  =  MMQ_Y_Q4_1_RDNA2;
    const int nwarps = NWARPS_Q4_1_RDNA2;
#else
    const int mmq_x  =  MMQ_X_Q4_1_RDNA1;
    const int mmq_y  =  MMQ_Y_Q4_1_RDNA1;
    const int nwarps = NWARPS_Q4_1_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)

    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= CC_VOLTA
    const int mmq_x  =  MMQ_X_Q4_1_AMPERE;
    const int mmq_y  =  MMQ_Y_Q4_1_AMPERE;
    const int nwarps = NWARPS_Q4_1_AMPERE;

    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= MIN_CC_DP4A
    const int mmq_x  =  MMQ_X_Q4_1_PASCAL;
    const int mmq_y  =  MMQ_Y_Q4_1_PASCAL;
    const int nwarps = NWARPS_Q4_1_PASCAL;

    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
    GGML_UNUSED(vec_dot_q4_1_q8_1_mul_mat);
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}

#define  MMQ_X_Q5_0_RDNA2  64
#define  MMQ_Y_Q5_0_RDNA2  128
#define NWARPS_Q5_0_RDNA2  8
#define  MMQ_X_Q5_0_RDNA1  64
#define  MMQ_Y_Q5_0_RDNA1  64
#define NWARPS_Q5_0_RDNA1  8
#if defined(CUDA_USE_TENSOR_CORES)
#define  MMQ_X_Q5_0_AMPERE 4
#define  MMQ_Y_Q5_0_AMPERE 32
#define NWARPS_Q5_0_AMPERE 4
#else
#define  MMQ_X_Q5_0_AMPERE 128
#define  MMQ_Y_Q5_0_AMPERE 64
#define NWARPS_Q5_0_AMPERE 4
#endif
#define  MMQ_X_Q5_0_PASCAL 64
#define  MMQ_Y_Q5_0_PASCAL 64
#define NWARPS_Q5_0_PASCAL 8

template <bool need_check> static __global__ void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    __launch_bounds__(WARP_SIZE*NWARPS_Q5_0_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    mul_mat_q5_0(
    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    const int mmq_x  =  MMQ_X_Q5_0_RDNA2;
    const int mmq_y  =  MMQ_Y_Q5_0_RDNA2;
    const int nwarps = NWARPS_Q5_0_RDNA2;
#else
    const int mmq_x  =  MMQ_X_Q5_0_RDNA1;
    const int mmq_y  =  MMQ_Y_Q5_0_RDNA1;
    const int nwarps = NWARPS_Q5_0_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)

    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= CC_VOLTA
    const int mmq_x  =  MMQ_X_Q5_0_AMPERE;
    const int mmq_y  =  MMQ_Y_Q5_0_AMPERE;
    const int nwarps = NWARPS_Q5_0_AMPERE;

    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= MIN_CC_DP4A
    const int mmq_x  =  MMQ_X_Q5_0_PASCAL;
    const int mmq_y  =  MMQ_Y_Q5_0_PASCAL;
    const int nwarps = NWARPS_Q5_0_PASCAL;

    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
    GGML_UNUSED(vec_dot_q5_0_q8_1_mul_mat);
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}

#define  MMQ_X_Q5_1_RDNA2  64
#define  MMQ_Y_Q5_1_RDNA2  128
#define NWARPS_Q5_1_RDNA2  8
#define  MMQ_X_Q5_1_RDNA1  64
#define  MMQ_Y_Q5_1_RDNA1  64
#define NWARPS_Q5_1_RDNA1  8
#if defined(CUDA_USE_TENSOR_CORES)
#define  MMQ_X_Q5_1_AMPERE 4
#define  MMQ_Y_Q5_1_AMPERE 32
#define NWARPS_Q5_1_AMPERE 4
#else
#define  MMQ_X_Q5_1_AMPERE 128
#define  MMQ_Y_Q5_1_AMPERE 64
#define NWARPS_Q5_1_AMPERE 4
#endif
#define  MMQ_X_Q5_1_PASCAL 64
#define  MMQ_Y_Q5_1_PASCAL 64
#define NWARPS_Q5_1_PASCAL 8

template <bool need_check> static __global__ void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    __launch_bounds__(WARP_SIZE*NWARPS_Q5_1_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
mul_mat_q5_1(
    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    const int mmq_x  =  MMQ_X_Q5_1_RDNA2;
    const int mmq_y  =  MMQ_Y_Q5_1_RDNA2;
    const int nwarps = NWARPS_Q5_1_RDNA2;
#else
    const int mmq_x  =  MMQ_X_Q5_1_RDNA1;
    const int mmq_y  =  MMQ_Y_Q5_1_RDNA1;
    const int nwarps = NWARPS_Q5_1_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)

    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= CC_VOLTA
    const int mmq_x  =  MMQ_X_Q5_1_AMPERE;
    const int mmq_y  =  MMQ_Y_Q5_1_AMPERE;
    const int nwarps = NWARPS_Q5_1_AMPERE;

    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= MIN_CC_DP4A
    const int mmq_x  =  MMQ_X_Q5_1_PASCAL;
    const int mmq_y  =  MMQ_Y_Q5_1_PASCAL;
    const int nwarps = NWARPS_Q5_1_PASCAL;

    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
    GGML_UNUSED(vec_dot_q5_1_q8_1_mul_mat);
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}

#define  MMQ_X_Q8_0_RDNA2  64
#define  MMQ_Y_Q8_0_RDNA2  128
#define NWARPS_Q8_0_RDNA2  8
#define  MMQ_X_Q8_0_RDNA1  64
#define  MMQ_Y_Q8_0_RDNA1  64
#define NWARPS_Q8_0_RDNA1  8
#if defined(CUDA_USE_TENSOR_CORES)
#define  MMQ_X_Q8_0_AMPERE 4
#define  MMQ_Y_Q8_0_AMPERE 32
#define NWARPS_Q8_0_AMPERE 4
#else
#define  MMQ_X_Q8_0_AMPERE 128
#define  MMQ_Y_Q8_0_AMPERE 64
#define NWARPS_Q8_0_AMPERE 4
#endif
#define  MMQ_X_Q8_0_PASCAL 64
#define  MMQ_Y_Q8_0_PASCAL 64
#define NWARPS_Q8_0_PASCAL 8

template <bool need_check> static __global__ void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    __launch_bounds__(WARP_SIZE*NWARPS_Q8_0_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    mul_mat_q8_0(
    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    const int mmq_x  =  MMQ_X_Q8_0_RDNA2;
    const int mmq_y  =  MMQ_Y_Q8_0_RDNA2;
    const int nwarps = NWARPS_Q8_0_RDNA2;
#else
    const int mmq_x  =  MMQ_X_Q8_0_RDNA1;
    const int mmq_y  =  MMQ_Y_Q8_0_RDNA1;
    const int nwarps = NWARPS_Q8_0_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)

    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= CC_VOLTA
    const int mmq_x  =  MMQ_X_Q8_0_AMPERE;
    const int mmq_y  =  MMQ_Y_Q8_0_AMPERE;
    const int nwarps = NWARPS_Q8_0_AMPERE;

    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= MIN_CC_DP4A
    const int mmq_x  =  MMQ_X_Q8_0_PASCAL;
    const int mmq_y  =  MMQ_Y_Q8_0_PASCAL;
    const int nwarps = NWARPS_Q8_0_PASCAL;

    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
    GGML_UNUSED(vec_dot_q8_0_q8_1_mul_mat);
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}

#define  MMQ_X_Q2_K_RDNA2  64
#define  MMQ_Y_Q2_K_RDNA2  128
#define NWARPS_Q2_K_RDNA2  8
#define  MMQ_X_Q2_K_RDNA1  128
#define  MMQ_Y_Q2_K_RDNA1  32
#define NWARPS_Q2_K_RDNA1  8
#if defined(CUDA_USE_TENSOR_CORES)
#define  MMQ_X_Q2_K_AMPERE 4
#define  MMQ_Y_Q2_K_AMPERE 32
#define NWARPS_Q2_K_AMPERE 4
#else
#define  MMQ_X_Q2_K_AMPERE 64
#define  MMQ_Y_Q2_K_AMPERE 128
#define NWARPS_Q2_K_AMPERE 4
#endif
#define  MMQ_X_Q2_K_PASCAL 64
#define  MMQ_Y_Q2_K_PASCAL 64
#define NWARPS_Q2_K_PASCAL 8

template <bool need_check> static __global__ void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    __launch_bounds__(WARP_SIZE*NWARPS_Q2_K_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
mul_mat_q2_K(
    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    const int mmq_x  =  MMQ_X_Q2_K_RDNA2;
    const int mmq_y  =  MMQ_Y_Q2_K_RDNA2;
    const int nwarps = NWARPS_Q2_K_RDNA2;
#else
    const int mmq_x  =  MMQ_X_Q2_K_RDNA1;
    const int mmq_y  =  MMQ_Y_Q2_K_RDNA1;
    const int nwarps = NWARPS_Q2_K_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)

    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= CC_VOLTA
    const int mmq_x  =  MMQ_X_Q2_K_AMPERE;
    const int mmq_y  =  MMQ_Y_Q2_K_AMPERE;
    const int nwarps = NWARPS_Q2_K_AMPERE;

    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= MIN_CC_DP4A
    const int mmq_x  =  MMQ_X_Q2_K_PASCAL;
    const int mmq_y  =  MMQ_Y_Q2_K_PASCAL;
    const int nwarps = NWARPS_Q2_K_PASCAL;

    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
    GGML_UNUSED(vec_dot_q2_K_q8_1_mul_mat);
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}

#define  MMQ_X_Q3_K_RDNA2  128
#define  MMQ_Y_Q3_K_RDNA2  64
#define NWARPS_Q3_K_RDNA2  8
#define  MMQ_X_Q3_K_RDNA1  32
#define  MMQ_Y_Q3_K_RDNA1  128
#define NWARPS_Q3_K_RDNA1  8
#if defined(CUDA_USE_TENSOR_CORES)
#define  MMQ_X_Q3_K_AMPERE 4
#define  MMQ_Y_Q3_K_AMPERE 32
#define NWARPS_Q3_K_AMPERE 4
#else
#define  MMQ_X_Q3_K_AMPERE 128
#define  MMQ_Y_Q3_K_AMPERE 128
#define NWARPS_Q3_K_AMPERE 4
#endif
#define  MMQ_X_Q3_K_PASCAL 64
#define  MMQ_Y_Q3_K_PASCAL 64
#define NWARPS_Q3_K_PASCAL 8

template <bool need_check> static __global__ void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#elif __CUDA_ARCH__ < CC_VOLTA
    __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
#endif // __CUDA_ARCH__ < CC_VOLTA
    mul_mat_q3_K(
    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    const int mmq_x  =  MMQ_X_Q3_K_RDNA2;
    const int mmq_y  =  MMQ_Y_Q3_K_RDNA2;
    const int nwarps = NWARPS_Q3_K_RDNA2;
#else
    const int mmq_x  =  MMQ_X_Q3_K_RDNA1;
    const int mmq_y  =  MMQ_Y_Q3_K_RDNA1;
    const int nwarps = NWARPS_Q3_K_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)

    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= CC_VOLTA
    const int mmq_x  =  MMQ_X_Q3_K_AMPERE;
    const int mmq_y  =  MMQ_Y_Q3_K_AMPERE;
    const int nwarps = NWARPS_Q3_K_AMPERE;

    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= MIN_CC_DP4A
    const int mmq_x  =  MMQ_X_Q3_K_PASCAL;
    const int mmq_y  =  MMQ_Y_Q3_K_PASCAL;
    const int nwarps = NWARPS_Q3_K_PASCAL;

    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
    GGML_UNUSED(vec_dot_q3_K_q8_1_mul_mat);
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}

#define  MMQ_X_Q4_K_RDNA2  64
#define  MMQ_Y_Q4_K_RDNA2  128
#define NWARPS_Q4_K_RDNA2  8
#define  MMQ_X_Q4_K_RDNA1  32
#define  MMQ_Y_Q4_K_RDNA1  64
#define NWARPS_Q4_K_RDNA1  8
#if defined(CUDA_USE_TENSOR_CORES)
#define  MMQ_X_Q4_K_AMPERE 4
#define  MMQ_Y_Q4_K_AMPERE 32
#define NWARPS_Q4_K_AMPERE 4
#else
#define  MMQ_X_Q4_K_AMPERE 64
#define  MMQ_Y_Q4_K_AMPERE 128
#define NWARPS_Q4_K_AMPERE 4
#endif
#define  MMQ_X_Q4_K_PASCAL 64
#define  MMQ_Y_Q4_K_PASCAL 64
#define NWARPS_Q4_K_PASCAL 8

template <bool need_check> static __global__ void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#elif __CUDA_ARCH__ < CC_VOLTA
    __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
#endif // __CUDA_ARCH__ < CC_VOLTA
    mul_mat_q4_K(
    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    const int mmq_x  =  MMQ_X_Q4_K_RDNA2;
    const int mmq_y  =  MMQ_Y_Q4_K_RDNA2;
    const int nwarps = NWARPS_Q4_K_RDNA2;
#else
    const int mmq_x  =  MMQ_X_Q4_K_RDNA1;
    const int mmq_y  =  MMQ_Y_Q4_K_RDNA1;
    const int nwarps = NWARPS_Q4_K_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)

    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= CC_VOLTA
    const int mmq_x  =  MMQ_X_Q4_K_AMPERE;
    const int mmq_y  =  MMQ_Y_Q4_K_AMPERE;
    const int nwarps = NWARPS_Q4_K_AMPERE;

    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= MIN_CC_DP4A
    const int mmq_x  =  MMQ_X_Q4_K_PASCAL;
    const int mmq_y  =  MMQ_Y_Q4_K_PASCAL;
    const int nwarps = NWARPS_Q4_K_PASCAL;

    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
    GGML_UNUSED(vec_dot_q4_K_q8_1_mul_mat);
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}

#define  MMQ_X_Q5_K_RDNA2  64
#define  MMQ_Y_Q5_K_RDNA2  128
#define NWARPS_Q5_K_RDNA2  8
#define  MMQ_X_Q5_K_RDNA1  32
#define  MMQ_Y_Q5_K_RDNA1  64
#define NWARPS_Q5_K_RDNA1  8
#if defined(CUDA_USE_TENSOR_CORES)
#define  MMQ_X_Q5_K_AMPERE 4
#define  MMQ_Y_Q5_K_AMPERE 32
#define NWARPS_Q5_K_AMPERE 4
#else
#define  MMQ_X_Q5_K_AMPERE 64
#define  MMQ_Y_Q5_K_AMPERE 128
#define NWARPS_Q5_K_AMPERE 4
#endif
#define  MMQ_X_Q5_K_PASCAL 64
#define  MMQ_Y_Q5_K_PASCAL 64
#define NWARPS_Q5_K_PASCAL 8

template <bool need_check> static __global__ void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    __launch_bounds__(WARP_SIZE*NWARPS_Q5_K_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
mul_mat_q5_K(
    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    const int mmq_x  =  MMQ_X_Q5_K_RDNA2;
    const int mmq_y  =  MMQ_Y_Q5_K_RDNA2;
    const int nwarps = NWARPS_Q5_K_RDNA2;
#else
    const int mmq_x  =  MMQ_X_Q5_K_RDNA1;
    const int mmq_y  =  MMQ_Y_Q5_K_RDNA1;
    const int nwarps = NWARPS_Q5_K_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)

    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= CC_VOLTA
    const int mmq_x  =  MMQ_X_Q5_K_AMPERE;
    const int mmq_y  =  MMQ_Y_Q5_K_AMPERE;
    const int nwarps = NWARPS_Q5_K_AMPERE;

    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= MIN_CC_DP4A
    const int mmq_x  =  MMQ_X_Q5_K_PASCAL;
    const int mmq_y  =  MMQ_Y_Q5_K_PASCAL;
    const int nwarps = NWARPS_Q5_K_PASCAL;

    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
    GGML_UNUSED(vec_dot_q5_K_q8_1_mul_mat);
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}

#define  MMQ_X_Q6_K_RDNA2  64
#define  MMQ_Y_Q6_K_RDNA2  128
#define NWARPS_Q6_K_RDNA2  8
#define  MMQ_X_Q6_K_RDNA1  32
#define  MMQ_Y_Q6_K_RDNA1  64
#define NWARPS_Q6_K_RDNA1  8
#if defined(CUDA_USE_TENSOR_CORES)
#define  MMQ_X_Q6_K_AMPERE 4
#define  MMQ_Y_Q6_K_AMPERE 32
#define NWARPS_Q6_K_AMPERE 4
#else
#define  MMQ_X_Q6_K_AMPERE 64
#define  MMQ_Y_Q6_K_AMPERE 64
#define NWARPS_Q6_K_AMPERE 4
#endif
#define  MMQ_X_Q6_K_PASCAL 64
#define  MMQ_Y_Q6_K_PASCAL 64
#define NWARPS_Q6_K_PASCAL 8

template <bool need_check> static __global__ void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#elif __CUDA_ARCH__ < CC_VOLTA
    __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
#endif // __CUDA_ARCH__ < CC_VOLTA
    mul_mat_q6_K(
    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    const int mmq_x  =  MMQ_X_Q6_K_RDNA2;
    const int mmq_y  =  MMQ_Y_Q6_K_RDNA2;
    const int nwarps = NWARPS_Q6_K_RDNA2;
#else
    const int mmq_x  =  MMQ_X_Q6_K_RDNA1;
    const int mmq_y  =  MMQ_Y_Q6_K_RDNA1;
    const int nwarps = NWARPS_Q6_K_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)

    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= CC_VOLTA
    const int mmq_x  =  MMQ_X_Q6_K_AMPERE;
    const int mmq_y  =  MMQ_Y_Q6_K_AMPERE;
    const int nwarps = NWARPS_Q6_K_AMPERE;

    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= MIN_CC_DP4A
    const int mmq_x  =  MMQ_X_Q6_K_PASCAL;
    const int mmq_y  =  MMQ_Y_Q6_K_PASCAL;
    const int nwarps = NWARPS_Q6_K_PASCAL;

    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
    GGML_UNUSED(vec_dot_q6_K_q8_1_mul_mat);
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}

template <int ncols_y, int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
// tell the compiler to use as many registers as it wants, see nwarps definition below
__launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static __global__ void mul_mat_vec_q(
    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
    constexpr int nwarps              = 1;
    constexpr int rows_per_cuda_block = 1;
#else
    constexpr int nwarps              = ncols_y <= 4 ? 4 : 2;
    constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)

    const     int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
    const     int row0 = rows_per_cuda_block*blockIdx.x;
    const     int blocks_per_row_x = ncols_x / qk;
    const     int blocks_per_col_y = nrows_y / QK8_1;
    constexpr int blocks_per_iter = vdr * nwarps*WARP_SIZE / qi;

// partial sum for each thread
    float tmp[ncols_y][rows_per_cuda_block] = {0.0f};

    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

    for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) {
        const int kby = kbx * (qk/QK8_1); // y block index that aligns with kbx

        // x block quant index when casting the quants to int
        const int kqs = vdr * (tid % (qi/vdr));

#pragma unroll
        for (int j = 0; j < ncols_y; ++j) {
#pragma unroll
            for (int i = 0; i < rows_per_cuda_block; ++i) {
                tmp[j][i] += vec_dot_q_cuda(
                    &x[kbx + (row0 + i)*blocks_per_row_x], &y[j*blocks_per_col_y + kby], kqs);
            }
        }
    }

    __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][WARP_SIZE];
    if (threadIdx.y > 0) {
#pragma unroll
        for (int j = 0; j < ncols_y; ++j) {
#pragma unroll
            for (int i = 0; i < rows_per_cuda_block; ++i) {
                tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i];
            }
        }
    }
    __syncthreads();
    if (threadIdx.y > 0) {
        return;
    }

    // sum up partial sums and write back result
#pragma unroll
    for (int j = 0; j < ncols_y; ++j) {
#pragma unroll
        for (int i = 0; i < rows_per_cuda_block; ++i) {
#pragma unroll
            for (int l = 0; l < nwarps-1; ++l) {
                tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
            }
            tmp[j][i] = warp_reduce_sum(tmp[j][i]);
        }

        if (threadIdx.x < rows_per_cuda_block) {
            dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
        }
    }
}

template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
    // qk = quantized weights per x block
    // qr = number of quantized weights per data value in x block
    const int row = blockIdx.x*blockDim.y + threadIdx.y;

    if (row >= nrows) {
        return;
    }

    const int tid = threadIdx.x;

    const int iter_stride = 2*GGML_CUDA_DMMV_X;
    const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
    const int y_offset = qr == 1 ? 1 : qk/2;

// partial sum for each thread
#ifdef GGML_CUDA_F16
    half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
#else
    float tmp = 0.0f;
#endif // GGML_CUDA_F16

    for (int i = 0; i < ncols; i += iter_stride) {
        const int col = i + vals_per_iter*tid;
        const int ib = (row*ncols + col)/qk; // x block index
        const int iqs = (col%qk)/qr; // x quant index
        const int iybs = col - col%qk; // y block start index

// processing >2 values per i iter is faster for fast GPUs
#pragma unroll
        for (int j = 0; j < vals_per_iter; j += 2) {
            // process 2 vals per j iter

            // dequantize
            // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
            dfloat2 v;
            dequantize_kernel(vx, ib, iqs + j/qr, v);

            // matrix multiplication
            // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
#ifdef GGML_CUDA_F16
            tmp += __hmul2(v, {
                y[iybs + iqs + j/qr + 0],
                y[iybs + iqs + j/qr + y_offset]
            });
#else
            tmp += v.x * y[iybs + iqs + j/qr + 0];
            tmp += v.y * y[iybs + iqs + j/qr + y_offset];
#endif // GGML_CUDA_F16
        }
    }

    // sum up partial sums and write back result
    tmp = warp_reduce_sum(tmp);

    if (tid == 0) {
#ifdef GGML_CUDA_F16
        dst[row] = tmp.x + tmp.y;
#else
        dst[row] = tmp;
#endif // GGML_CUDA_F16
    }
}

static __global__ void mul_mat_p021_f16_f32(
    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y) {

    const half * x = (const half *) vx;

    const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
    const int channel = blockDim.z*blockIdx.z + threadIdx.z;
    const int channel_x = channel / (nchannels_y / nchannels_x);

    const int nrows_y = ncols_x;
    const int nrows_dst = nrows_x;
    const int row_dst = row_x;

    float tmp = 0.0f;

    for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
        const int col_x = col_x0 + threadIdx.x;

        if (col_x >= ncols_x) {
            break;
        }

        // x is transposed and permuted
        const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x;
        const float xi = __half2float(x[ix]);

        const int row_y = col_x;

        // y is not transposed but permuted
        const int iy = channel*nrows_y + row_y;

        tmp += xi * y[iy];
    }

    // dst is not transposed and not permuted
    const int idst = channel*nrows_dst + row_dst;

    // sum up partial sums and write back result
    tmp = warp_reduce_sum(tmp);

    if (threadIdx.x == 0) {
        dst[idst] = tmp;
    }
}

static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
    const int row_stride_x, const int channel_stride_x, const int channel_x_divisor) {

    const half * x = (const half *) vx;

    const int row_x     = blockDim.y*blockIdx.y + threadIdx.y;
    const int channel   = blockDim.z*blockIdx.z + threadIdx.z;
    const int channel_x = channel / channel_x_divisor;

    const int nrows_y   = ncols_x;
    const int nrows_dst = nrows_x;
    const int row_dst   = row_x;

    const int idst = channel*nrows_dst + row_dst;

    float tmp = 0.0f;

    for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
        const int col_x = col_x0 + threadIdx.x;

        if (col_x >= ncols_x) {
            break;
        }

        const int row_y = col_x;

        const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
        const int iy = channel*nrows_y + row_y;

        const float xi = __half2float(x[ix]);

        tmp += xi * y[iy];
    }

    // sum up partial sums and write back result
    tmp = warp_reduce_sum(tmp);

    if (threadIdx.x == 0) {
        dst[idst] = tmp;
    }
}

static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
    const float * xi = (const float *) cxi;
    float * dsti = (float *) cdsti;

    *dsti = *xi;
}

static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
    const float * xi = (const float *) cxi;
    half * dsti = (half *) cdsti;

    *dsti = __float2half(*xi);
}

static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) {
    const half * xi = (const half *) cxi;
    half * dsti = (half *) cdsti;

    *dsti = *xi;
}

static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
    const half * xi = (const half *) cxi;
    float * dsti = (float *) cdsti;

    *dsti = *xi;
}

template <cpy_kernel_t cpy_1>
static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
                                   const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
                                   const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
                                   const int nb12, const int nb13) {
    const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= ne) {
        return;
    }

    // determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
    // then combine those indices with the corresponding byte offsets to get the total offsets
    const int64_t i03 = i/(ne00 * ne01 * ne02);
    const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
    const int64_t i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
    const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
    const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;

    const int64_t i13 = i/(ne10 * ne11 * ne12);
    const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
    const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
    const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
    const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;

    cpy_1(cx + x_offset, cdst + dst_offset);
}

static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
    const float * xi = (const float *) cxi;
    block_q8_0 * dsti = (block_q8_0 *) cdsti;

    float amax = 0.0f; // absolute max

    for (int j = 0; j < QK8_0; j++) {
        const float v = xi[j];
        amax = fmaxf(amax, fabsf(v));
    }

    const float d = amax / ((1 << 7) - 1);
    const float id = d ? 1.0f/d : 0.0f;

    dsti->d = d;

    for (int j = 0; j < QK8_0; ++j) {
        const float x0 = xi[j]*id;

        dsti->qs[j] = roundf(x0);
    }
}

static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
    const float * xi = (const float *) cxi;
    block_q4_0 * dsti = (block_q4_0 *) cdsti;

    float amax = 0.0f;
    float vmax = 0.0f;

    for (int j = 0; j < QK4_0; ++j) {
        const float v = xi[j];
        if (amax < fabsf(v)) {
            amax = fabsf(v);
            vmax = v;
        }
    }

    const float d  = vmax / -8;
    const float id = d ? 1.0f/d : 0.0f;

    dsti->d = d;

    for (int j = 0; j < QK4_0/2; ++j) {
        const float x0 = xi[0       + j]*id;
        const float x1 = xi[QK4_0/2 + j]*id;

        const uint8_t xi0 = min(15, (int8_t)(x0 + 8.5f));
        const uint8_t xi1 = min(15, (int8_t)(x1 + 8.5f));

        dsti->qs[j]  = xi0;
        dsti->qs[j] |= xi1 << 4;
    }
}

static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
    const float * xi = (const float *) cxi;
    block_q4_1 * dsti = (block_q4_1 *) cdsti;

    float vmin = FLT_MAX;
    float vmax = -FLT_MAX;

    for (int j = 0; j < QK4_1; ++j) {
        const float v = xi[j];

        if (v < vmin) vmin = v;
        if (v > vmax) vmax = v;
    }

    const float d  = (vmax - vmin) / ((1 << 4) - 1);
    const float id = d ? 1.0f/d : 0.0f;

    dsti->dm.x = d;
    dsti->dm.y = vmin;

    for (int j = 0; j < QK4_1/2; ++j) {
        const float x0 = (xi[0       + j] - vmin)*id;
        const float x1 = (xi[QK4_1/2 + j] - vmin)*id;

        const uint8_t xi0 = min(15, (int8_t)(x0 + 0.5f));
        const uint8_t xi1 = min(15, (int8_t)(x1 + 0.5f));

        dsti->qs[j]  = xi0;
        dsti->qs[j] |= xi1 << 4;
    }
}

static __device__ void cpy_blck_f32_q5_0(const char * cxi, char * cdsti) {
    const float * xi = (const float *) cxi;
    block_q5_0 * dsti = (block_q5_0 *) cdsti;

    float amax = 0.0f;
    float vmax = 0.0f;

    for (int j = 0; j < QK5_0; ++j) {
        const float v = xi[j];
        if (amax < fabsf(v)) {
            amax = fabsf(v);
            vmax = v;
        }
    }

    const float d  = vmax / -16;
    const float id = d ? 1.0f/d : 0.0f;

    dsti->d = d;

    uint32_t qh = 0;
    for (int j = 0; j < QK5_0/2; ++j) {
        const float x0 = xi[0       + j]*id;
        const float x1 = xi[QK5_0/2 + j]*id;

        const uint8_t xi0 = min(31, (int8_t)(x0 + 16.5f));
        const uint8_t xi1 = min(31, (int8_t)(x1 + 16.5f));

        dsti->qs[j]  = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
        qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
        qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2);
    }
    memcpy(dsti->qh, &qh, sizeof(qh));
}

static __device__ void cpy_blck_f32_q5_1(const char * cxi, char * cdsti) {
    const float * xi = (const float *) cxi;
    block_q5_1 * dsti = (block_q5_1 *) cdsti;

    float min = xi[0];
    float max = xi[0];

    for (int j = 1; j < QK5_1; ++j) {
        const float v = xi[j];
        min = v < min ? v : min;
        max = v > max ? v : max;
    }

    const float d  = (max - min) / 31;
    const float id = d ? 1.0f/d : 0.0f;

    dsti->dm.x = d;
    dsti->dm.y = min;

    uint32_t qh = 0;
    for (int j = 0; j < QK5_1/2; ++j) {
        const float x0 = (xi[0       + j] - min)*id;
        const float x1 = (xi[QK5_1/2 + j] - min)*id;

        const uint8_t xi0 = (uint8_t)(x0 + 0.5f);
        const uint8_t xi1 = (uint8_t)(x1 + 0.5f);

        dsti->qs[j]  = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
        qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
        qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1/2);
    }
    memcpy(dsti->qh, &qh, sizeof(qh));
}

static __device__ __forceinline__ int best_index_int8(int n, const int8_t * val, float x) {
    if (x <= val[0]) return 0;
    if (x >= val[n-1]) return n-1;
    int ml = 0, mu = n-1;
    while (mu-ml > 1) {
        int mav = (ml+mu)/2;
        if (x < val[mav]) mu = mav; else ml = mav;
    }
    return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
}

static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
    const float * xi = (const float *) cxi;
    block_iq4_nl * dsti = (block_iq4_nl *) cdsti;

    float amax = 0.0f;
    float vmax = 0.0f;

    for (int j = 0; j < QK4_NL; ++j) {
        const float v = xi[j];
        if (amax < fabsf(v)) {
            amax = fabsf(v);
            vmax = v;
        }
    }

    float d = vmax / kvalues_iq4nl[0];
    const float id = d ? 1.0f/d : 0.0f;

    float sumqx = 0, sumq2 = 0;
    for (int j = 0; j < QK4_NL/2; ++j) {
        const float x0 = xi[0        + j]*id;
        const float x1 = xi[QK4_NL/2 + j]*id;
        const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl, x0);
        const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl, x1);
        dsti->qs[j] = xi0 | (xi1 << 4);
        const float v0 = kvalues_iq4nl[xi0];
        const float v1 = kvalues_iq4nl[xi1];
        const float w0 = xi[0        + j]*xi[0        + j];
        const float w1 = xi[QK4_NL/2 + j]*xi[QK4_NL/2 + j];
        sumqx += w0*v0*xi[j] + w1*v1*xi[QK4_NL/2 + j];
        sumq2 += w0*v0*v0 + w1*v1*v1;
    }

    dsti->d = sumq2 > 0 ? sumqx/sumq2 : d;
}


template <cpy_kernel_t cpy_blck, int qk>
static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
                                 const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
                                 const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
                                 const int nb12, const int nb13) {
    const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;

    if (i >= ne) {
        return;
    }

    const int i03 = i/(ne00 * ne01 * ne02);
    const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
    const int i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
    const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;

    const int i13 = i/(ne10 * ne11 * ne12);
    const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
    const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
    const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
    const int dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13;

    cpy_blck(cx + x_offset, cdst + dst_offset);
}

static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
    const float y = (i0 / 2 - low) / max(0.001f, high - low);
    return 1.0f - min(1.0f, max(0.0f, y));
}

struct rope_corr_dims {
    float v[4];
};

// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
static __device__ void rope_yarn(
    float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
    float * cos_theta, float * sin_theta
) {
    // Get n-d rotational scaling corrected for extrapolation
    float theta_interp = freq_scale * theta_extrap;
    float theta = theta_interp;
    if (ext_factor != 0.0f) {
        float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;

        // Get n-d magnitude scaling corrected for interpolation
        mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
    }
    *cos_theta = cosf(theta) * mscale;
    *sin_theta = sinf(theta) * mscale;
}

// rope == RoPE == rotary positional embedding
template<typename T, bool has_pos>
static __global__ void rope(
    const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
    float ext_factor, float attn_factor, rope_corr_dims corr_dims
) {
    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

    if (col >= ncols) {
        return;
    }

    const int row = blockDim.x*blockIdx.x + threadIdx.x;
    const int i = row*ncols + col;
    const int i2 = row/p_delta_rows;

    const int p = has_pos ? pos[i2] : 0;
    const float theta_base = p*powf(freq_base, -float(col)/ncols);

    float cos_theta, sin_theta;
    rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta);

    const float x0 = x[i + 0];
    const float x1 = x[i + 1];

    dst[i + 0] = x0*cos_theta - x1*sin_theta;
    dst[i + 1] = x0*sin_theta + x1*cos_theta;
}

template<typename T, bool has_pos>
static __global__ void rope_neox(
    const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
) {
    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

    if (col >= ncols) {
        return;
    }

    const int row = blockDim.x*blockIdx.x + threadIdx.x;
    const int ib = col / n_dims;
    const int ic = col % n_dims;

    if (ib > 0) {
        const int i = row*ncols + ib*n_dims + ic;

        dst[i + 0] = x[i + 0];
        dst[i + 1] = x[i + 1];

        return;
    }

    const int i  = row*ncols + ib*n_dims + ic/2;
    const int i2 = row/p_delta_rows;

    float cur_rot = inv_ndims * ic - ib;

    const int p = has_pos ? pos[i2] : 0;
    const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f);

    float cos_theta, sin_theta;
    rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);

    const float x0 = x[i + 0];
    const float x1 = x[i + n_dims/2];

    dst[i + 0]        = x0*cos_theta - x1*sin_theta;
    dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
}

static __global__ void rope_glm_f32(
    const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
    int n_ctx
) {
    const int col = blockDim.x*blockIdx.x + threadIdx.x;
    const int half_n_dims = ncols/4;

    if (col >= half_n_dims) {
        return;
    }

    const int row = blockDim.y*blockIdx.y + threadIdx.y;
    const int i = row*ncols + col;
    const int i2 = row/p_delta_rows;

    const float col_theta_scale = powf(freq_base, -2.0f*col/ncols);
     // FIXME: this is likely wrong
    const int p = pos != nullptr ? pos[i2] : 0;

    const float theta = min(p, n_ctx - 2)*freq_scale*col_theta_scale;
    const float sin_theta = sinf(theta);
    const float cos_theta = cosf(theta);

    const float x0 = x[i + 0];
    const float x1 = x[i + half_n_dims];

    dst[i + 0]           = x0*cos_theta - x1*sin_theta;
    dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;

    const float block_theta = ((float)max(p - n_ctx - 2, 0))*col_theta_scale;
    const float sin_block_theta = sinf(block_theta);
    const float cos_block_theta = cosf(block_theta);

    const float x2 = x[i + half_n_dims * 2];
    const float x3 = x[i + half_n_dims * 3];

    dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
    dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
}

static __global__ void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
                                 const int n_heads_log2_floor, const float m0, const float m1) {
    const int col = blockDim.x*blockIdx.x + threadIdx.x;

    if (col >= ncols) {
        return;
    }

    const int row = blockDim.y*blockIdx.y + threadIdx.y;
    const int i = row*ncols + col;

    const int k = row/k_rows;

    float m_k;
    if (k < n_heads_log2_floor) {
        m_k = powf(m0, k + 1);
    } else {
        m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
    }

    dst[i] = col * m_k + x[i];
}

static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
    const int row = blockIdx.x;
    const int col = threadIdx.x;

    float sum = 0.0f;
    for (int i = col; i < ncols; i += blockDim.x) {
        sum += x[row * ncols + i];
    }

    sum = warp_reduce_sum(sum);

    if (col == 0) {
        dst[row] = sum;
    }
}

template<typename T>
static inline __device__ void ggml_cuda_swap(T & a, T & b) {
    T tmp = a;
    a = b;
    b = tmp;
}

template<ggml_sort_order order>
static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols) {
    // bitonic sort
    int col = threadIdx.x;
    int row = blockIdx.y;

    if (col >= ncols) return;

    const float * x_row = x + row * ncols;
    int * dst_row = dst + row * ncols;

    // initialize indices
    if (col < ncols) {
        dst_row[col] = col;
    }
    __syncthreads();

    for (int k = 2; k <= ncols; k *= 2) {
        for (int j = k / 2; j > 0; j /= 2) {
            int ixj = col ^ j;
            if (ixj > col) {
                if ((col & k) == 0) {
                    if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);
                    }
                } else {
                    if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);
                    }
                }
            }
            __syncthreads();
        }
    }
}

static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
    const int col = blockDim.y*blockIdx.y + threadIdx.y;
    const int row = blockDim.x*blockIdx.x + threadIdx.x;

    if (col >= ncols) {
        return;
    }

    const int i = row*ncols + col;
    //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i];
    //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
    dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
}

template <bool vals_smem, int ncols_template, int block_size_template>
static __global__ void soft_max_f32(const float * x, const float * mask, const float * pos, float * dst, const int ncols_par, const int nrows_y, const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) {
    const int ncols = ncols_template == 0 ? ncols_par : ncols_template;

    const int tid  = threadIdx.x;
    const int rowx = blockIdx.x;
    const int rowy = rowx % nrows_y; // broadcast the mask in the row dimension

    const int block_size = block_size_template == 0 ? blockDim.x : block_size_template;

    const int warp_id = threadIdx.x / WARP_SIZE;
    const int lane_id = threadIdx.x % WARP_SIZE;

    float slope = 0.0f;

    // ALiBi
    if (max_bias > 0.0f) {
        const int h = rowx/nrows_y; // head index

        const float base = h < n_head_log2 ? m0 : m1;
        const int   exp  = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;

        slope = powf(base, exp);
    }

    extern __shared__ float data_soft_max_f32[];
    float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication
    // shared memory buffer to cache values between iterations:
    float * vals = vals_smem ? buf_iw + WARP_SIZE : dst + rowx*ncols;

    float max_val = -INFINITY;

#pragma unroll
    for (int col0 = 0; col0 < ncols; col0 += block_size) {
        const int col = col0 + tid;

        if (ncols_template == 0 && col >= ncols) {
            break;
        }

        const int ix = rowx*ncols + col;
        const int iy = rowy*ncols + col;

        const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);

        vals[col] = val;
        max_val = max(max_val, val);
    }

    // find the max value in the block
    max_val = warp_reduce_max(max_val);
    if (block_size > WARP_SIZE) {
        if (warp_id == 0) {
            buf_iw[lane_id] = -INFINITY;
        }
        __syncthreads();

        if (lane_id == 0) {
            buf_iw[warp_id] = max_val;
        }
        __syncthreads();

        max_val = buf_iw[lane_id];
        max_val = warp_reduce_max(max_val);
    }

    float tmp = 0.0f; // partial sum

#pragma unroll
    for (int col0 = 0; col0 < ncols; col0 += block_size) {
        const int col = col0 + tid;

        if (ncols_template == 0 && col >= ncols) {
            break;
        }

        const float val = expf(vals[col] - max_val);
        tmp += val;
        vals[col] = val;
    }

    // find the sum of exps in the block
    tmp = warp_reduce_sum(tmp);
    if (block_size > WARP_SIZE) {
        __syncthreads();
        if (warp_id == 0) {
            buf_iw[lane_id] = 0.0f;
        }
        __syncthreads();

        if (lane_id == 0) {
            buf_iw[warp_id] = tmp;
        }
        __syncthreads();

        tmp = buf_iw[lane_id];
        tmp = warp_reduce_sum(tmp);
    }

    const float inv_sum = 1.0f / tmp;

#pragma unroll
    for (int col0 = 0; col0 < ncols; col0 += block_size) {
        const int col = col0 + tid;

        if (ncols_template == 0 && col >= ncols) {
            return;
        }

        const int idst = rowx*ncols + col;
        dst[idst] = vals[col] * inv_sum;
    }
}

static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= k) {
        return;
    }

    dst[i] = scale * x[i];
}

static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= k) {
        return;
    }

    dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
}

template <typename T>
static  __global__ void im2col_kernel(
        const float * x, T * dst, int64_t batch_offset,
        int64_t offset_delta, int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH, int64_t pelements, int64_t CHW,
        int s0, int s1, int p0, int p1, int d0, int d1) {
    const int64_t i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i >= pelements) {
        return;
    }

    const int64_t  ksize = OW * (KH > 1 ? KW : 1);
    const int64_t  kx = i / ksize;
    const int64_t  kd = kx * ksize;
    const int64_t  ky = (i - kd) / OW;
    const int64_t  ix = i % OW;

    const int64_t  oh = blockIdx.y;
    const int64_t  batch = blockIdx.z / IC;
    const int64_t  ic = blockIdx.z % IC;

    const int64_t iiw = ix * s0 + kx * d0 - p0;
    const int64_t iih = oh * s1 + ky * d1 - p1;

    const int64_t offset_dst =
        ((batch * OH + oh) * OW + ix) * CHW +
        (ic * (KW * KH) + ky * KW + kx);

    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
        dst[offset_dst] = 0.0f;
    } else {
        const int64_t offset_src = ic * offset_delta + batch * batch_offset;
        dst[offset_dst] = x[offset_src + iih * IW + iiw];
    }
}

template <typename Ti, typename To>
static  __global__ void pool2d_nchw_kernel(
        const int ih, const int iw, const int oh, const int ow,
        const int kh, const int kw, const int sh, const int sw,
        const int ph, const int pw, const int parallel_elements,
        const Ti* src, To* dst, const enum ggml_op_pool op) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= parallel_elements) {
        return;
    }

    const int I_HW = ih * iw;
    const int O_HW = oh * ow;
    const int nc = idx / O_HW;
    const int cur_oh = idx % O_HW / ow;
    const int cur_ow = idx % O_HW % ow;
    const Ti* i_ptr = src + nc * I_HW;
    To* o_ptr = dst + nc * O_HW;
    const int start_h = cur_oh * sh - ph;
    const int bh = max(0, start_h);
    const int eh = min(ih, start_h + kh);
    const int start_w = cur_ow * sw - pw;
    const int bw = max(0, start_w);
    const int ew = min(iw, start_w + kw);
    const To scale = 1. / (kh * kw);
    To res = 0;

    switch (op) {
        case GGML_OP_POOL_AVG: res = 0; break;
        case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
        default: assert(false);
    }

    for (int i = bh; i < eh; i += 1) {
        for (int j = bw; j < ew; j += 1) {
#if __CUDA_ARCH__ >= 350
            Ti cur = __ldg(i_ptr + i * iw + j);
#else
            Ti cur = i_ptr[i * iw + j];
#endif
            switch (op) {
                case GGML_OP_POOL_AVG: res += cur * scale; break;
                case GGML_OP_POOL_MAX: res = max(res, (To)cur); break;
                default: assert(false);
            }
        }
    }
    o_ptr[cur_oh * ow + cur_ow] = res;
}

template<int qk, int qr, dequantize_kernel_t dq>
static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
                            const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {

    GGML_TENSOR_BINARY_OP_LOCALS

    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
    const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
    const dim3 block_nums(block_num_x, ne10, ne11*ne12);

    // strides in elements
    //const size_t s0 = nb0 / ggml_element_size(dst);
    const size_t s1 = nb1 / ggml_element_size(dst);
    const size_t s2 = nb2 / ggml_element_size(dst);
    const size_t s3 = nb3 / ggml_element_size(dst);

    const size_t s10 = nb10 / ggml_element_size(src1);
    const size_t s11 = nb11 / ggml_element_size(src1);
    const size_t s12 = nb12 / ggml_element_size(src1);
    //const size_t s13 = nb13 / ggml_element_size(src1);

    GGML_ASSERT(ne00 % 2 == 0);

    k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
            src0_dd, src1_dd, dst_dd,
            ne00, /*ne01, ne02, ne03,*/
            /*ne10, ne11,*/ ne12, /*ne13,*/
            /* s0,*/ s1, s2, s3,
            /* nb00,*/ nb01, nb02, nb03,
            s10, s11, s12/*, s13*/);

    GGML_UNUSED(dst);
}

template<typename src0_t>
static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
                                const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {

    GGML_TENSOR_BINARY_OP_LOCALS

    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
    const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
    const dim3 block_nums(block_num_x, ne10, ne11*ne12);

    // strides in elements
    //const size_t s0 = nb0 / ggml_element_size(dst);
    const size_t s1 = nb1 / ggml_element_size(dst);
    const size_t s2 = nb2 / ggml_element_size(dst);
    const size_t s3 = nb3 / ggml_element_size(dst);

    const size_t s10 = nb10 / ggml_element_size(src1);
    const size_t s11 = nb11 / ggml_element_size(src1);
    const size_t s12 = nb12 / ggml_element_size(src1);
    //const size_t s13 = nb13 / ggml_element_size(src1);

    k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
            src0_dd, src1_dd, dst_dd,
            ne00, /*ne01, ne02, ne03,*/
            /*ne10, ne11,*/ ne12, /*ne13,*/
            /* s0,*/ s1, s2, s3,
            /* nb00,*/ nb01, nb02, nb03,
            s10, s11, s12/*, s13*/);

    GGML_UNUSED(dst);
}

template<float (*bin_op)(const float, const float)>
struct bin_bcast_cuda {
    template<typename src0_t, typename src1_t, typename dst_t>
    void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst,
            const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd,
            cudaStream_t stream) {

        GGML_TENSOR_BINARY_OP_LOCALS

        int nr0 = ne10/ne0;
        int nr1 = ne11/ne1;
        int nr2 = ne12/ne2;
        int nr3 = ne13/ne3;

        int nr[4] = { nr0, nr1, nr2, nr3 };

        // collapse dimensions until first broadcast dimension
        int64_t cne0[] = {ne0, ne1, ne2, ne3};
        int64_t cne1[] = {ne10, ne11, ne12, ne13};
        size_t cnb0[] = {nb0, nb1, nb2, nb3};
        size_t cnb1[] = {nb10, nb11, nb12, nb13};
        auto collapse = [](int64_t cne[]) {
            cne[0] *= cne[1];
            cne[1] = cne[2];
            cne[2] = cne[3];
            cne[3] = 1;
        };

        auto collapse_nb = [](size_t cnb[], const int64_t cne[]) {
            cnb[1] *= cne[1];
            cnb[2] *= cne[2];
            cnb[3] *= cne[3];
        };

        for (int i = 0; i < 4; i++) {
            if (nr[i] != 1) {
                break;
            }
            if (i > 0) {
                collapse_nb(cnb0, cne0);
                collapse_nb(cnb1, cne1);
                collapse(cne0);
                collapse(cne1);
            }
        }
        {
            int64_t ne0 = cne0[0];
            int64_t ne1 = cne0[1];
            int64_t ne2 = cne0[2];
            int64_t ne3 = cne0[3];

            int64_t ne10 = cne1[0];
            int64_t ne11 = cne1[1];
            int64_t ne12 = cne1[2];
            int64_t ne13 = cne1[3];

            size_t nb0 = cnb0[0];
            size_t nb1 = cnb0[1];
            size_t nb2 = cnb0[2];
            size_t nb3 = cnb0[3];

            size_t nb10 = cnb1[0];
            size_t nb11 = cnb1[1];
            size_t nb12 = cnb1[2];
            size_t nb13 = cnb1[3];

            size_t s0 = nb0 / sizeof(dst_t);
            size_t s1 = nb1 / sizeof(dst_t);
            size_t s2 = nb2 / sizeof(dst_t);
            size_t s3 = nb3 / sizeof(dst_t);

            size_t s10 = nb10 / sizeof(src1_t);
            size_t s11 = nb11 / sizeof(src1_t);
            size_t s12 = nb12 / sizeof(src1_t);
            size_t s13 = nb13 / sizeof(src1_t);

            GGML_ASSERT(s0 == 1);
            GGML_ASSERT(s10 == 1);

            const int block_size = 128;

            int64_t hne0 = std::max(ne0/2LL, 1LL);

            dim3 block_dims;
            block_dims.x = std::min<unsigned int>(hne0, block_size);
            block_dims.y = std::min<unsigned int>(ne1, block_size / block_dims.x);
            block_dims.z = std::min(std::min<unsigned int>(ne2*ne3, block_size / block_dims.x / block_dims.y), 64U);

            dim3 block_nums(
                (hne0 + block_dims.x - 1) / block_dims.x,
                (ne1 + block_dims.y - 1) / block_dims.y,
                (ne2*ne3 + block_dims.z - 1) / block_dims.z
            );

            if (block_nums.z > 65535) {
                // this is the maximum number of blocks in z direction, fallback to 1D grid kernel
                int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
                k_bin_bcast_unravel<bin_op><<<block_num, block_size, 0, stream>>>(
                    src0_dd, src1_dd, dst_dd,
                    ne0, ne1, ne2, ne3,
                    ne10, ne11, ne12, ne13,
                    /* s0, */ s1, s2, s3,
                    /* s10, */ s11, s12, s13);
            } else {
                k_bin_bcast<bin_op><<<block_nums, block_dims, 0, stream>>>(
                    src0_dd, src1_dd, dst_dd,
                    ne0, ne1, ne2, ne3,
                    ne10, ne11, ne12, ne13,
                    /* s0, */ s1, s2, s3,
                    /* s10, */ s11, s12, s13);
            }
        }
    }
};

static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
    const int ne10, const int ne11, const int ne12,
    const int nb1, const int nb2, const int offset, cudaStream_t stream) {
    int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
    acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
}

static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
    gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}

static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
    silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}

static void gelu_quick_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
    gelu_quick_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}

static void tanh_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_TANH_BLOCK_SIZE - 1) / CUDA_TANH_BLOCK_SIZE;
    tanh_f32<<<num_blocks, CUDA_TANH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}

static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
    relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}

static void hardsigmoid_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_HARDSIGMOID_BLOCK_SIZE - 1) / CUDA_HARDSIGMOID_BLOCK_SIZE;
    hardsigmoid_f32<<<num_blocks, CUDA_HARDSIGMOID_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}

static void hardswish_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_HARDSWISH_BLOCK_SIZE - 1) / CUDA_HARDSWISH_BLOCK_SIZE;
    hardswish_f32<<<num_blocks, CUDA_HARDSWISH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}

static void leaky_relu_f32_cuda(const float * x, float * dst, const int k, const float negative_slope, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
    leaky_relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, negative_slope);
}

static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
    sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}

static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
    GGML_ASSERT(ncols % WARP_SIZE == 0);
    if (ncols < 1024) {
        const dim3 block_dims(WARP_SIZE, 1, 1);
        norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
    } else {
        const dim3 block_dims(1024, 1, 1);
        norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
    }
}

static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const int group_size, const int ne_elements, cudaStream_t stream) {
    static const float eps = 1e-6f;
    if (group_size < 1024) {
        const dim3 block_dims(WARP_SIZE, 1, 1);
        group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
    } else {
        const dim3 block_dims(1024, 1, 1);
        group_norm_f32<1024><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
    }
}

static void concat_f32_cuda(const float * x, const float * y, float * dst, const int ne0, int ne1, int ne2, int ne02, cudaStream_t stream) {
    int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
    dim3 gridDim(num_blocks, ne1, ne2);
    concat_f32<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
}

static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int ne03,
                             const int scale_factor, cudaStream_t stream) {
    int ne0 = (ne00 * scale_factor);
    int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
    dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02*ne03);
    upscale_f32<<<gridDim, CUDA_UPSCALE_BLOCK_SIZE, 0, stream>>>(x, dst, ne00, ne00 * ne01, scale_factor);
}

static void pad_f32_cuda(const float * x, float * dst,
    const int ne00, const int ne01, const int ne02, const int ne03,
    const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
    int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
    dim3 gridDim(num_blocks, ne1, ne2*ne3);
    pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
}

static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
    int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
    arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start,  step);
}

static void timestep_embedding_f32_cuda(const float * x, float * dst, const int ne00, const int nb1,
                                        const int dim, const int max_period, cudaStream_t stream) {
    int half_ceil = (dim + 1) / 2;
    int num_blocks = (half_ceil + CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE;
    dim3 gridDim(num_blocks, ne00, 1);
    timestep_embedding_f32<<<gridDim, CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE, 0, stream>>>(x, dst, nb1, dim, max_period);
}

static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
    GGML_ASSERT(ncols % WARP_SIZE == 0);
    if (ncols < 1024) {
        const dim3 block_dims(WARP_SIZE, 1, 1);
        rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
    } else {
        const dim3 block_dims(1024, 1, 1);
        rms_norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
    }
}

static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
    const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
    const dim3 num_blocks(block_num_x, ky, 1);
    const dim3 block_size(CUDA_DEQUANTIZE_BLOCK_SIZE, 1, 1);
    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
}

template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
    const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE);
    dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
}

static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1) / CUDA_Q8_0_NE_ALIGN;
    if (k % CUDA_Q8_0_NE_ALIGN == 0) {
        const bool need_check = false;
        dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
    } else {
        const bool need_check = true;
        dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
    }
}

template<typename dst_t>
static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
#if QK_K == 256
    dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
#else
    dequantize_block_q2_K<<<nb, 32, 0, stream>>>(vx, y);
#endif
}

template<typename dst_t>
static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
#if QK_K == 256
    dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
#else
    dequantize_block_q3_K<<<nb, 32, 0, stream>>>(vx, y);
#endif
}

template<typename dst_t>
static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb32 = k / 32;
    const int nb = (k + 255) / 256;
    dequantize_block_q4_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
}

template<typename dst_t>
static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb32 = k / 32;
    const int nb = (k + 255) / 256;
    dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
}

template<typename dst_t>
static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
}

template<typename dst_t>
static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
#if QK_K == 256
    dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
#else
    dequantize_block_q5_K<<<nb, 32, 0, stream>>>(vx, y);
#endif
}

template<typename dst_t>
static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
#if QK_K == 256
    dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
#else
    dequantize_block_q6_K<<<nb, 32, 0, stream>>>(vx, y);
#endif
}

template<typename dst_t>
static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq2_xxs<<<nb, 32, 0, stream>>>(vx, y);
}

template<typename dst_t>
static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
}

template<typename dst_t>
static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq2_s<<<nb, 32, 0, stream>>>(vx, y);
}

template<typename dst_t>
static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq3_xxs<<<nb, 32, 0, stream>>>(vx, y);
}

template<typename dst_t>
static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq3_s<<<nb, 32, 0, stream>>>(vx, y);
}

template<typename dst_t>
static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
}

template<typename dst_t>
static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = (k + QK_K - 1) / QK_K;
    dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
}

template<typename dst_t>
static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = (k + QK_K - 1) / QK_K;
#if QK_K == 64
    dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
#else
    dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
#endif
}

template <typename src_t, typename dst_t>
static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
    convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
}

static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
    int id;
    switch (type) {
        case GGML_TYPE_Q4_0:
            return dequantize_row_q4_0_cuda;
        case GGML_TYPE_Q4_1:
            return dequantize_row_q4_1_cuda;
        case GGML_TYPE_Q5_0:
            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
        case GGML_TYPE_Q5_1:
            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
        case GGML_TYPE_Q8_0:
            CUDA_CHECK(cudaGetDevice(&id));
            if (get_cuda_global_info().devices[id].cc >= CC_PASCAL) {
                return dequantize_block_q8_0_f16_cuda;
            }
            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
        case GGML_TYPE_Q2_K:
            return dequantize_row_q2_K_cuda;
        case GGML_TYPE_Q3_K:
            return dequantize_row_q3_K_cuda;
        case GGML_TYPE_Q4_K:
            return dequantize_row_q4_K_cuda;
        case GGML_TYPE_Q5_K:
            return dequantize_row_q5_K_cuda;
        case GGML_TYPE_Q6_K:
            return dequantize_row_q6_K_cuda;
        case GGML_TYPE_IQ2_XXS:
            return dequantize_row_iq2_xxs_cuda;
        case GGML_TYPE_IQ2_XS:
            return dequantize_row_iq2_xs_cuda;
        case GGML_TYPE_IQ2_S:
            return dequantize_row_iq2_s_cuda;
        case GGML_TYPE_IQ3_XXS:
            return dequantize_row_iq3_xxs_cuda;
        case GGML_TYPE_IQ1_S:
            return dequantize_row_iq1_s_cuda;
        case GGML_TYPE_IQ4_NL:
            return dequantize_row_iq4_nl_cuda;
        case GGML_TYPE_IQ4_XS:
            return dequantize_row_iq4_xs_cuda;
        case GGML_TYPE_IQ3_S:
            return dequantize_row_iq3_s_cuda;
        case GGML_TYPE_F32:
            return convert_unary_cuda<float>;
        default:
            return nullptr;
    }
}

static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
    switch (type) {
        case GGML_TYPE_Q4_0:
            return dequantize_row_q4_0_cuda;
        case GGML_TYPE_Q4_1:
            return dequantize_row_q4_1_cuda;
        case GGML_TYPE_Q5_0:
            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
        case GGML_TYPE_Q5_1:
            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
        case GGML_TYPE_Q8_0:
            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
        case GGML_TYPE_Q2_K:
            return dequantize_row_q2_K_cuda;
        case GGML_TYPE_Q3_K:
            return dequantize_row_q3_K_cuda;
        case GGML_TYPE_Q4_K:
            return dequantize_row_q4_K_cuda;
        case GGML_TYPE_Q5_K:
            return dequantize_row_q5_K_cuda;
        case GGML_TYPE_Q6_K:
            return dequantize_row_q6_K_cuda;
        case GGML_TYPE_IQ2_XXS:
            return dequantize_row_iq2_xxs_cuda;
        case GGML_TYPE_IQ2_XS:
            return dequantize_row_iq2_xs_cuda;
        case GGML_TYPE_IQ2_S:
            return dequantize_row_iq2_s_cuda;
        case GGML_TYPE_IQ3_XXS:
            return dequantize_row_iq3_xxs_cuda;
        case GGML_TYPE_IQ1_S:
            return dequantize_row_iq1_s_cuda;
        case GGML_TYPE_IQ4_NL:
            return dequantize_row_iq4_nl_cuda;
        case GGML_TYPE_IQ4_XS:
            return dequantize_row_iq4_xs_cuda;
        case GGML_TYPE_IQ3_S:
            return dequantize_row_iq3_s_cuda;
        case GGML_TYPE_F16:
            return convert_unary_cuda<half>;
        default:
            return nullptr;
    }
}

static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}

static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}

static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}

static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}

static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}

static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
    const int block_num_y = (nrows + ny - 1) / ny;
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(32, ny, 1);
    dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}

static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int ny = 2 / K_QUANTS_PER_ITERATION;
    const int block_num_y = (nrows + ny - 1) / ny;
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(32, ny, 1);
    dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}

static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int ny = 2 / K_QUANTS_PER_ITERATION;
    const int block_num_y = (nrows + ny - 1) / ny;
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(32, ny, 1);
    dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}

static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const dim3 block_dims(32, 1, 1);
    dequantize_mul_mat_vec_q5_k<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols);
}

static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int ny = 2 / K_QUANTS_PER_ITERATION;
    const int block_num_y = (nrows + ny - 1) / ny;
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(32, ny, 1);
    dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}

static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    dequantize_mul_mat_vec<1, 1, convert_f16>
        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}

template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot>
static void mul_mat_vec_q_cuda(
    const void * vx, const void * vy, float * dst,
    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

    GGML_ASSERT(ncols_x % qk == 0);
    GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE);

    int id;
    CUDA_CHECK(cudaGetDevice(&id));

    int64_t nwarps = 1;
    int64_t rows_per_cuda_block = 1;

    if (get_cuda_global_info().devices[id].cc < CC_RDNA2) { // NVIDIA and AMD older than RDNA2
        switch(ncols_y) {
            case 1:
                nwarps = 4;
                rows_per_cuda_block = 1;
                break;
            case 2:
            case 3:
            case 4:
                nwarps = 4;
                rows_per_cuda_block = 2;
                break;
            case 5:
            case 6:
            case 7:
            case 8:
                nwarps = 2;
                rows_per_cuda_block = 2;
                break;
            default:
                GGML_ASSERT(false);
                break;
        }
    }
    const int64_t nblocks = (nrows_x + rows_per_cuda_block - 1) / rows_per_cuda_block;
    const dim3 block_nums(nblocks, 1, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    switch (ncols_y) {
        case 1:
            mul_mat_vec_q<1, qk, qi, block_q_t, vdr, vec_dot>
                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
            break;
        case 2:
            mul_mat_vec_q<2, qk, qi, block_q_t, vdr, vec_dot>
                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
            break;
        case 3:
            mul_mat_vec_q<3, qk, qi, block_q_t, vdr, vec_dot>
                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
            break;
        case 4:
            mul_mat_vec_q<4, qk, qi, block_q_t, vdr, vec_dot>
                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
            break;
        case 5:
            mul_mat_vec_q<5, qk, qi, block_q_t, vdr, vec_dot>
                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
            break;
        case 6:
            mul_mat_vec_q<6, qk, qi, block_q_t, vdr, vec_dot>
                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
            break;
        case 7:
            mul_mat_vec_q<7, qk, qi, block_q_t, vdr, vec_dot>
                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
            break;
        case 8:
            mul_mat_vec_q<8, qk, qi, block_q_t, vdr, vec_dot>
                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
            break;
        default:
            GGML_ASSERT(false);
            break;
    }
}

static void ggml_mul_mat_q4_0_q8_1_cuda(
    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    int id;
    CUDA_CHECK(cudaGetDevice(&id));
    const int compute_capability = get_cuda_global_info().devices[id].cc;

    int mmq_x, mmq_y, nwarps;
    if (compute_capability >= CC_RDNA2) {
        mmq_x  =  MMQ_X_Q4_0_RDNA2;
        mmq_y  =  MMQ_Y_Q4_0_RDNA2;
        nwarps = NWARPS_Q4_0_RDNA2;
    } else if (compute_capability >= CC_OFFSET_AMD) {
        mmq_x  =  MMQ_X_Q4_0_RDNA1;
        mmq_y  =  MMQ_Y_Q4_0_RDNA1;
        nwarps = NWARPS_Q4_0_RDNA1;
    } else if (compute_capability >= CC_VOLTA) {
        mmq_x  =  MMQ_X_Q4_0_AMPERE;
        mmq_y  =  MMQ_Y_Q4_0_AMPERE;
        nwarps = NWARPS_Q4_0_AMPERE;
    } else if (compute_capability >= MIN_CC_DP4A) {
        mmq_x  =  MMQ_X_Q4_0_PASCAL;
        mmq_y  =  MMQ_Y_Q4_0_PASCAL;
        nwarps = NWARPS_Q4_0_PASCAL;
    } else {
        GGML_ASSERT(false);
    }

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}

static void ggml_mul_mat_q4_1_q8_1_cuda(
    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    int id;
    CUDA_CHECK(cudaGetDevice(&id));
    const int compute_capability = get_cuda_global_info().devices[id].cc;

    int mmq_x, mmq_y, nwarps;
    if (compute_capability >= CC_RDNA2) {
        mmq_x  =  MMQ_X_Q4_1_RDNA2;
        mmq_y  =  MMQ_Y_Q4_1_RDNA2;
        nwarps = NWARPS_Q4_1_RDNA2;
    } else if (compute_capability >= CC_OFFSET_AMD) {
        mmq_x  =  MMQ_X_Q4_1_RDNA1;
        mmq_y  =  MMQ_Y_Q4_1_RDNA1;
        nwarps = NWARPS_Q4_1_RDNA1;
    } else if (compute_capability >= CC_VOLTA) {
        mmq_x  =  MMQ_X_Q4_1_AMPERE;
        mmq_y  =  MMQ_Y_Q4_1_AMPERE;
        nwarps = NWARPS_Q4_1_AMPERE;
    } else if (compute_capability >= MIN_CC_DP4A) {
        mmq_x  =  MMQ_X_Q4_1_PASCAL;
        mmq_y  =  MMQ_Y_Q4_1_PASCAL;
        nwarps = NWARPS_Q4_1_PASCAL;
    } else {
        GGML_ASSERT(false);
    }

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}

static void ggml_mul_mat_q5_0_q8_1_cuda(
    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    int id;
    CUDA_CHECK(cudaGetDevice(&id));
    const int compute_capability = get_cuda_global_info().devices[id].cc;

    int mmq_x, mmq_y, nwarps;
    if (compute_capability >= CC_RDNA2) {
        mmq_x  =  MMQ_X_Q5_0_RDNA2;
        mmq_y  =  MMQ_Y_Q5_0_RDNA2;
        nwarps = NWARPS_Q5_0_RDNA2;
    } else if (compute_capability >= CC_OFFSET_AMD) {
        mmq_x  =  MMQ_X_Q5_0_RDNA1;
        mmq_y  =  MMQ_Y_Q5_0_RDNA1;
        nwarps = NWARPS_Q5_0_RDNA1;
    } else if (compute_capability >= CC_VOLTA) {
        mmq_x  =  MMQ_X_Q5_0_AMPERE;
        mmq_y  =  MMQ_Y_Q5_0_AMPERE;
        nwarps = NWARPS_Q5_0_AMPERE;
    } else if (compute_capability >= MIN_CC_DP4A) {
        mmq_x  =  MMQ_X_Q5_0_PASCAL;
        mmq_y  =  MMQ_Y_Q5_0_PASCAL;
        nwarps = NWARPS_Q5_0_PASCAL;
    } else {
        GGML_ASSERT(false);
    }

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}

static void ggml_mul_mat_q5_1_q8_1_cuda(
    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    int id;
    CUDA_CHECK(cudaGetDevice(&id));
    const int compute_capability = get_cuda_global_info().devices[id].cc;

    int mmq_x, mmq_y, nwarps;
    if (compute_capability >= CC_RDNA2) {
        mmq_x  =  MMQ_X_Q5_1_RDNA2;
        mmq_y  =  MMQ_Y_Q5_1_RDNA2;
        nwarps = NWARPS_Q5_1_RDNA2;
    } else if (compute_capability >= CC_OFFSET_AMD) {
        mmq_x  =  MMQ_X_Q5_1_RDNA1;
        mmq_y  =  MMQ_Y_Q5_1_RDNA1;
        nwarps = NWARPS_Q5_1_RDNA1;
    } else if (compute_capability >= CC_VOLTA) {
        mmq_x  =  MMQ_X_Q5_1_AMPERE;
        mmq_y  =  MMQ_Y_Q5_1_AMPERE;
        nwarps = NWARPS_Q5_1_AMPERE;
    } else if (compute_capability >= MIN_CC_DP4A) {
        mmq_x  =  MMQ_X_Q5_1_PASCAL;
        mmq_y  =  MMQ_Y_Q5_1_PASCAL;
        nwarps = NWARPS_Q5_1_PASCAL;
    } else {
        GGML_ASSERT(false);
    }

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}

static void ggml_mul_mat_q8_0_q8_1_cuda(
    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    int id;
    CUDA_CHECK(cudaGetDevice(&id));
    const int compute_capability = get_cuda_global_info().devices[id].cc;

    int mmq_x, mmq_y, nwarps;
    if (compute_capability >= CC_RDNA2) {
        mmq_x  =  MMQ_X_Q8_0_RDNA2;
        mmq_y  =  MMQ_Y_Q8_0_RDNA2;
        nwarps = NWARPS_Q8_0_RDNA2;
    } else if (compute_capability >= CC_OFFSET_AMD) {
        mmq_x  =  MMQ_X_Q8_0_RDNA1;
        mmq_y  =  MMQ_Y_Q8_0_RDNA1;
        nwarps = NWARPS_Q8_0_RDNA1;
    } else if (compute_capability >= CC_VOLTA) {
        mmq_x  =  MMQ_X_Q8_0_AMPERE;
        mmq_y  =  MMQ_Y_Q8_0_AMPERE;
        nwarps = NWARPS_Q8_0_AMPERE;
    } else if (compute_capability >= MIN_CC_DP4A) {
        mmq_x  =  MMQ_X_Q8_0_PASCAL;
        mmq_y  =  MMQ_Y_Q8_0_PASCAL;
        nwarps = NWARPS_Q8_0_PASCAL;
    } else {
        GGML_ASSERT(false);
    }

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}

static void ggml_mul_mat_q2_K_q8_1_cuda(
    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    int id;
    CUDA_CHECK(cudaGetDevice(&id));
    const int compute_capability = get_cuda_global_info().devices[id].cc;

    int mmq_x, mmq_y, nwarps;
    if (compute_capability >= CC_RDNA2) {
        mmq_x  =  MMQ_X_Q2_K_RDNA2;
        mmq_y  =  MMQ_Y_Q2_K_RDNA2;
        nwarps = NWARPS_Q2_K_RDNA2;
    } else if (compute_capability >= CC_OFFSET_AMD) {
        mmq_x  =  MMQ_X_Q2_K_RDNA1;
        mmq_y  =  MMQ_Y_Q2_K_RDNA1;
        nwarps = NWARPS_Q2_K_RDNA1;
    } else if (compute_capability >= CC_VOLTA) {
        mmq_x  =  MMQ_X_Q2_K_AMPERE;
        mmq_y  =  MMQ_Y_Q2_K_AMPERE;
        nwarps = NWARPS_Q2_K_AMPERE;
    } else if (compute_capability >= MIN_CC_DP4A) {
        mmq_x  =  MMQ_X_Q2_K_PASCAL;
        mmq_y  =  MMQ_Y_Q2_K_PASCAL;
        nwarps = NWARPS_Q2_K_PASCAL;
    } else {
        GGML_ASSERT(false);
    }

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}

static void ggml_mul_mat_q3_K_q8_1_cuda(
    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

#if QK_K == 256

    int id;
    CUDA_CHECK(cudaGetDevice(&id));
    const int compute_capability = get_cuda_global_info().devices[id].cc;

    int mmq_x, mmq_y, nwarps;
    if (compute_capability >= CC_RDNA2) {
        mmq_x  =  MMQ_X_Q3_K_RDNA2;
        mmq_y  =  MMQ_Y_Q3_K_RDNA2;
        nwarps = NWARPS_Q3_K_RDNA2;
    } else if (compute_capability >= CC_OFFSET_AMD) {
        mmq_x  =  MMQ_X_Q3_K_RDNA1;
        mmq_y  =  MMQ_Y_Q3_K_RDNA1;
        nwarps = NWARPS_Q3_K_RDNA1;
    } else if (compute_capability >= CC_VOLTA) {
        mmq_x  =  MMQ_X_Q3_K_AMPERE;
        mmq_y  =  MMQ_Y_Q3_K_AMPERE;
        nwarps = NWARPS_Q3_K_AMPERE;
    } else if (compute_capability >= MIN_CC_DP4A) {
        mmq_x  =  MMQ_X_Q3_K_PASCAL;
        mmq_y  =  MMQ_Y_Q3_K_PASCAL;
        nwarps = NWARPS_Q3_K_PASCAL;
    } else {
        GGML_ASSERT(false);
    }

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
#endif
}

static void ggml_mul_mat_q4_K_q8_1_cuda(
    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    int id;
    CUDA_CHECK(cudaGetDevice(&id));
    const int compute_capability = get_cuda_global_info().devices[id].cc;

    int mmq_x, mmq_y, nwarps;
    if (compute_capability >= CC_RDNA2) {
        mmq_x  =  MMQ_X_Q4_K_RDNA2;
        mmq_y  =  MMQ_Y_Q4_K_RDNA2;
        nwarps = NWARPS_Q4_K_RDNA2;
    } else if (compute_capability >= CC_OFFSET_AMD) {
        mmq_x  =  MMQ_X_Q4_K_RDNA1;
        mmq_y  =  MMQ_Y_Q4_K_RDNA1;
        nwarps = NWARPS_Q4_K_RDNA1;
    } else if (compute_capability >= CC_VOLTA) {
        mmq_x  =  MMQ_X_Q4_K_AMPERE;
        mmq_y  =  MMQ_Y_Q4_K_AMPERE;
        nwarps = NWARPS_Q4_K_AMPERE;
    } else if (compute_capability >= MIN_CC_DP4A) {
        mmq_x  =  MMQ_X_Q4_K_PASCAL;
        mmq_y  =  MMQ_Y_Q4_K_PASCAL;
        nwarps = NWARPS_Q4_K_PASCAL;
    } else {
        GGML_ASSERT(false);
    }

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}

static void ggml_mul_mat_q5_K_q8_1_cuda(
    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    int id;
    CUDA_CHECK(cudaGetDevice(&id));
    const int compute_capability = get_cuda_global_info().devices[id].cc;

    int mmq_x, mmq_y, nwarps;
    if (compute_capability >= CC_RDNA2) {
        mmq_x  =  MMQ_X_Q5_K_RDNA2;
        mmq_y  =  MMQ_Y_Q5_K_RDNA2;
        nwarps = NWARPS_Q5_K_RDNA2;
    } else if (compute_capability >= CC_OFFSET_AMD) {
        mmq_x  =  MMQ_X_Q5_K_RDNA1;
        mmq_y  =  MMQ_Y_Q5_K_RDNA1;
        nwarps = NWARPS_Q5_K_RDNA1;
    } else if (compute_capability >= CC_VOLTA) {
        mmq_x  =  MMQ_X_Q5_K_AMPERE;
        mmq_y  =  MMQ_Y_Q5_K_AMPERE;
        nwarps = NWARPS_Q5_K_AMPERE;
    } else if (compute_capability >= MIN_CC_DP4A) {
        mmq_x  =  MMQ_X_Q5_K_PASCAL;
        mmq_y  =  MMQ_Y_Q5_K_PASCAL;
        nwarps = NWARPS_Q5_K_PASCAL;
    } else {
        GGML_ASSERT(false);
    }

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}

static void ggml_mul_mat_q6_K_q8_1_cuda(
    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    int id;
    CUDA_CHECK(cudaGetDevice(&id));
    const int compute_capability = get_cuda_global_info().devices[id].cc;

    int mmq_x, mmq_y, nwarps;
    if (compute_capability >= CC_RDNA2) {
        mmq_x  =  MMQ_X_Q6_K_RDNA2;
        mmq_y  =  MMQ_Y_Q6_K_RDNA2;
        nwarps = NWARPS_Q6_K_RDNA2;
    } else if (compute_capability >= CC_OFFSET_AMD) {
        mmq_x  =  MMQ_X_Q6_K_RDNA1;
        mmq_y  =  MMQ_Y_Q6_K_RDNA1;
        nwarps = NWARPS_Q6_K_RDNA1;
    } else if (compute_capability >= CC_VOLTA) {
        mmq_x  =  MMQ_X_Q6_K_AMPERE;
        mmq_y  =  MMQ_Y_Q6_K_AMPERE;
        nwarps = NWARPS_Q6_K_AMPERE;
    } else if (compute_capability >= MIN_CC_DP4A) {
        mmq_x  =  MMQ_X_Q6_K_PASCAL;
        mmq_y  =  MMQ_Y_Q6_K_PASCAL;
        nwarps = NWARPS_Q6_K_PASCAL;
    } else {
        GGML_ASSERT(false);
    }

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}

static void ggml_mul_mat_p021_f16_f32_cuda(
    const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
    const int nchannels_x, const int nchannels_y, cudaStream_t stream) {

    const dim3 block_nums(1, nrows_x, nchannels_y);
    const dim3 block_dims(WARP_SIZE, 1, 1);
    mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x, nchannels_y);
}

static void ggml_mul_mat_vec_nc_f16_f32_cuda(
    const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x,
    const int nchannels_x, const int nchannels_y, const int channel_stride_x, cudaStream_t stream) {

    const dim3 block_nums(1, nrows_x, nchannels_y);
    const dim3 block_dims(WARP_SIZE, 1, 1);
    mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
        (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x);
}


static void ggml_cpy_f16_f32_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
    cpy_f32_f16<cpy_1_f16_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_f32_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
    cpy_f32_f16<cpy_1_f32_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_f16_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
    cpy_f32_f16<cpy_1_f32_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_q8_0_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK8_0 == 0);
    const int num_blocks = ne / QK8_0;
    cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_q4_0_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK4_0 == 0);
    const int num_blocks = ne / QK4_0;
    cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_q4_1_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK4_1 == 0);
    const int num_blocks = ne / QK4_1;
    cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_q5_0_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK5_0 == 0);
    const int num_blocks = ne / QK5_0;
    cpy_f32_q<cpy_blck_f32_q5_0, QK5_0><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_q5_1_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK5_1 == 0);
    const int num_blocks = ne / QK5_1;
    cpy_f32_q<cpy_blck_f32_q5_1, QK5_1><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_iq4_nl_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK4_NL == 0);
    const int num_blocks = ne / QK4_NL;
    cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f16_f16_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
    cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}


static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
    scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
}

static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
    clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
}

template<typename T>
static void rope_cuda(
    const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
) {
    GGML_ASSERT(ncols % 2 == 0);
    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
    const dim3 block_nums(nrows, num_blocks_x, 1);
    if (pos == nullptr) {
        rope<T, false><<<block_nums, block_dims, 0, stream>>>(
            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
        );
    } else {
        rope<T, true><<<block_nums, block_dims, 0, stream>>>(
            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
        );
    }
}

template<typename T>
static void rope_neox_cuda(
    const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
) {
    GGML_ASSERT(ncols % 2 == 0);
    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
    const dim3 block_nums(nrows, num_blocks_x, 1);

    const float theta_scale = powf(freq_base, -2.0f/n_dims);
    const float inv_ndims = -1.0f / n_dims;

    if (pos == nullptr) {
        rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
            x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
            theta_scale, inv_ndims
        );
    } else {
        rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
            x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
            theta_scale, inv_ndims
        );
    }
}

static void rope_glm_f32_cuda(
    const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
    float freq_base, int n_ctx, cudaStream_t stream
) {
    GGML_ASSERT(ncols % 4 == 0);
    const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
    const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
    const dim3 block_nums(num_blocks_x, nrows, 1);
    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, n_ctx);
}

static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
                           const int k_rows, const int n_heads_log2_floor, const float m0,
                           const float m1, cudaStream_t stream) {
    const dim3 block_dims(CUDA_ALIBI_BLOCK_SIZE, 1, 1);
    const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE);
    const dim3 block_nums(num_blocks_x, nrows, 1);
    alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1);
}

static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    const dim3 block_dims(WARP_SIZE, 1, 1);
    const dim3 block_nums(nrows, 1, 1);
    k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
}

static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
    // bitonic sort requires ncols to be power of 2
    GGML_ASSERT((ncols & (ncols - 1)) == 0);

    const dim3 block_dims(ncols, 1, 1);
    const dim3 block_nums(1, nrows, 1);
    if (order == GGML_SORT_ORDER_ASC) {
        k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
    } else if (order == GGML_SORT_ORDER_DESC) {
        k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
    } else {
        GGML_ASSERT(false);
    }
}

static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
    const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
    const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
    const dim3 block_nums(nrows_x, block_num_x, 1);
    diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
}

static void soft_max_f32_cuda(const float * x, const float * mask, const float * pos, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, const float max_bias, cudaStream_t stream) {
    int nth = WARP_SIZE;
    while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
    const dim3 block_dims(nth,     1, 1);
    const dim3 block_nums(nrows_x, 1, 1);
    const size_t shmem = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float);
    static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");

    const uint32_t n_head_kv   = nrows_x/nrows_y;
    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));

    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

    if (shmem < get_cuda_global_info().devices[ggml_cuda_get_device()].smpb) {
        switch (ncols_x) {
            case 32:
                soft_max_f32<true, 32, 32><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            case 64:
                soft_max_f32<true, 64, 64><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            case 128:
                soft_max_f32<true, 128, 128><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            case 256:
                soft_max_f32<true, 256, 256><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            case 512:
                soft_max_f32<true, 512, 512><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            case 1024:
                soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            case 2048:
                soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            case 4096:
                soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            default:
                soft_max_f32<true, 0, 0><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
        }
    } else {
        const size_t shmem_low = WARP_SIZE*sizeof(float);
        soft_max_f32<false, 0, 0><<<block_nums, block_dims, shmem_low, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
    }
}

template <typename T>
static void im2col_cuda(const float * x, T* dst,
    int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
    int64_t batch, int64_t batch_offset, int64_t offset_delta,
    int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
    const int parallel_elements = OW * KW * KH;
    const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
    dim3 block_nums(num_blocks, OH, batch * IC);
    im2col_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
}

static cudaError_t ggml_cuda_cpy_tensor_2d(
    void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {

    GGML_ASSERT(ggml_backend_buffer_is_cuda(src->buffer));
    char * src_ptr = (char *) src->data;
    char * dst_ptr = (char *) dst;

    const int64_t ne0 = src->ne[0];
    const int64_t nb0 = src->nb[0];
    const int64_t nb1 = src->nb[1];
    const int64_t nb2 = src->nb[2];
    const int64_t nb3 = src->nb[3];
    const enum ggml_type type = src->type;
    const int64_t ts = ggml_type_size(type);
    const int64_t bs = ggml_blck_size(type);
    int64_t i1_diff = i1_high - i1_low;

    const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
    if (nb0 == ts && nb1 == ts*ne0/bs) {
        return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, cudaMemcpyDeviceToDevice, stream);
    } else if (nb0 == ts) {
        return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, cudaMemcpyDeviceToDevice, stream);
    } else {
        for (int64_t i1 = 0; i1 < i1_diff; i1++) {
            const void * rx = (const void *) ((const char *) x + i1*nb1);
            void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
            // pretend the row is a matrix with cols=1
            cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, cudaMemcpyDeviceToDevice, stream);
            if (r != cudaSuccess) {
                return r;
            }
        }
        return cudaSuccess;
    }
}

static void ggml_cuda_op_get_rows(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_d, const float * src1_d, float * dst_d, cudaStream_t stream) {

    GGML_UNUSED(ctx);

    GGML_ASSERT(src1->type == GGML_TYPE_I32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
    GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));

    const int32_t * src1_i32 = (const int32_t *) src1_d;

    switch (src0->type) {
        case GGML_TYPE_F16:
            get_rows_cuda_float(src0, src1, dst, (const half *)src0_d, src1_i32, dst_d, stream);
            break;
        case GGML_TYPE_F32:
            get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        case GGML_TYPE_Q4_0:
            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        case GGML_TYPE_Q4_1:
            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        case GGML_TYPE_Q5_0:
            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        case GGML_TYPE_Q5_1:
            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        case GGML_TYPE_Q8_0:
            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        default:
            // TODO: k-quants
            fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
            GGML_ASSERT(false);
            break;
    }
}

template<class op>
static void ggml_cuda_op_bin_bcast(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {

    GGML_UNUSED(ctx);

    GGML_ASSERT(src1->type == GGML_TYPE_F32);

    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
        op()(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
        op()(src0, src1, dst, (const half *) src0_dd, src1_dd, (half *) dst_dd, main_stream);
    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
        op()(src0, src1, dst, (const half *) src0_dd, src1_dd, dst_dd, main_stream);
    } else {
        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
            ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
        GGML_ASSERT(false);
    }
}

static void ggml_cuda_op_repeat(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_d, const float * src1_d, float * dst_d, cudaStream_t main_stream) {

    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(ctx, dst, src0, dst, nullptr, src0_d, dst_d, main_stream);

    GGML_UNUSED(src1);
    GGML_UNUSED(src1_d);
}

static void ggml_cuda_op_add(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {

    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
}

static void ggml_cuda_op_acc(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {

    GGML_UNUSED(ctx);

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported

    int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
    int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
    int offset = dst->op_params[3] / 4; // offset in bytes

    acc_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream);

    GGML_UNUSED(dst);
}

static void ggml_cuda_op_mul(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {

    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
}

static void ggml_cuda_op_div(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {

    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
}

static void ggml_cuda_op_gelu(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
    GGML_UNUSED(ctx);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    gelu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);

    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
    GGML_UNUSED(src1_dd);
}

static void ggml_cuda_op_silu(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
    GGML_UNUSED(ctx);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    silu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);

    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
    GGML_UNUSED(src1_dd);
}

static void ggml_cuda_op_gelu_quick(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
    GGML_UNUSED(ctx);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    gelu_quick_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);

    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
    GGML_UNUSED(src1_dd);
}

static void ggml_cuda_op_tanh(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
    GGML_UNUSED(ctx);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    tanh_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);

    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
    GGML_UNUSED(src1_dd);
}

static void ggml_cuda_op_relu(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
    GGML_UNUSED(ctx);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);

    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
    GGML_UNUSED(src1_dd);
}

static void ggml_cuda_op_hardsigmoid(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
    GGML_UNUSED(ctx);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    hardsigmoid_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);

    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
    GGML_UNUSED(src1_dd);
}

static void ggml_cuda_op_hardswish(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
    GGML_UNUSED(ctx);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    hardswish_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);

    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
    GGML_UNUSED(src1_dd);
}

static void ggml_cuda_op_leaky_relu(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
    GGML_UNUSED(ctx);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    float negative_slope;
    memcpy(&negative_slope, dst->op_params, sizeof(float));

    leaky_relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream);

    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
    GGML_UNUSED(src1_dd);
}

static void ggml_cuda_op_sqr(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
    GGML_UNUSED(ctx);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    sqr_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);

    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
    GGML_UNUSED(src1_dd);
}

static void ggml_cuda_op_norm(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
    GGML_UNUSED(ctx);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    const int64_t ne00 = src0->ne[0];
    const int64_t nrows = ggml_nrows(src0);

    float eps;
    memcpy(&eps, dst->op_params, sizeof(float));

    norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);

    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
    GGML_UNUSED(src1_dd);
}

static void ggml_cuda_op_group_norm(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
    GGML_UNUSED(ctx);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    int num_groups = dst->op_params[0];
    int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
    group_norm_f32_cuda(src0_dd, dst_dd, num_groups * src0->ne[3], group_size, ggml_nelements(src0), main_stream);

    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
    GGML_UNUSED(src1_dd);
}

static void ggml_cuda_op_concat(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
    GGML_UNUSED(ctx);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    for (int i3 = 0; i3 < dst->ne[3]; i3++) {
        concat_f32_cuda(src0_dd + i3 * (src0->nb[3] / 4), src1_dd + i3 * (src1->nb[3] / 4), dst_dd + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], main_stream);
    }

    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
}

static void ggml_cuda_op_upscale(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
    GGML_UNUSED(ctx);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);
    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors

    const int scale_factor = dst->op_params[0];

    upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], scale_factor, main_stream);

    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
    GGML_UNUSED(src1_dd);
}

static void ggml_cuda_op_pad(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
    GGML_UNUSED(ctx);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);
    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors

    pad_f32_cuda(src0_dd, dst_dd,
        src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
        dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], main_stream);

    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
    GGML_UNUSED(src1_dd);
}

static void ggml_cuda_op_arange(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
    GGML_UNUSED(ctx);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    float start;
    float stop;
    float step;
    memcpy(&start, (float *)dst->op_params + 0, sizeof(float));
    memcpy(&stop,  (float *)dst->op_params + 1, sizeof(float));
    memcpy(&step,  (float *)dst->op_params + 2, sizeof(float));

    int64_t steps = (int64_t)ceil((stop - start) / step);
    GGML_ASSERT(ggml_nelements(dst) == steps);

    arange_f32_cuda(dst_dd, dst->ne[0], start, step, main_stream);

    GGML_UNUSED(src0);
    GGML_UNUSED(src1);
    GGML_UNUSED(src0_dd);
    GGML_UNUSED(src1_dd);
}

static void ggml_cuda_op_timestep_embedding(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
    GGML_UNUSED(ctx);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    const int dim = dst->op_params[0];
    const int max_period = dst->op_params[1];

    timestep_embedding_f32_cuda(src0_dd, dst_dd, src0->ne[0], dst->nb[1], dim, max_period, main_stream);

    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
    GGML_UNUSED(src1_dd);
}

static void ggml_cuda_op_rms_norm(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
    GGML_UNUSED(ctx);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    const int64_t ne00 = src0->ne[0];
    const int64_t nrows = ggml_nrows(src0);

    float eps;
    memcpy(&eps, dst->op_params, sizeof(float));

    rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);

    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
    GGML_UNUSED(src1_dd);
}

static void ggml_cuda_op_mul_mat_q(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
    const int64_t src1_padded_row_size, cudaStream_t stream) {

    const int64_t ne00 = src0->ne[0];

    const int64_t ne10 = src1->ne[0];
    GGML_ASSERT(ne10 % QK8_1 == 0);

    const int64_t ne0 = dst->ne[0];

    const int64_t row_diff = row_high - row_low;

    int id = ggml_cuda_get_device();

    // the main device has a larger memory buffer to hold the results from all GPUs
    // nrows_dst == nrows of the matrix that the kernel writes into
    const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;

    switch (src0->type) {
        case GGML_TYPE_Q4_0:
            ggml_mul_mat_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
            break;
        case GGML_TYPE_Q4_1:
            ggml_mul_mat_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
            break;
        case GGML_TYPE_Q5_0:
            ggml_mul_mat_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
            break;
        case GGML_TYPE_Q5_1:
            ggml_mul_mat_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
            break;
        case GGML_TYPE_Q8_0:
            ggml_mul_mat_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
            break;
        case GGML_TYPE_Q2_K:
            ggml_mul_mat_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
            break;
        case GGML_TYPE_Q3_K:
            ggml_mul_mat_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
            break;
        case GGML_TYPE_Q4_K:
            ggml_mul_mat_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
            break;
        case GGML_TYPE_Q5_K:
            ggml_mul_mat_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
            break;
        case GGML_TYPE_Q6_K:
            ggml_mul_mat_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
            break;
        default:
            GGML_ASSERT(false);
            break;
    }

    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
    GGML_UNUSED(src1_ddf_i);
}

static void ggml_cuda_op_mul_mat_vec_q(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
    const int64_t src1_padded_row_size, cudaStream_t stream) {

    const int64_t ne00 = src0->ne[0];
    const int64_t row_diff = row_high - row_low;

    const int64_t ne10 = src1->ne[0];
    GGML_ASSERT(ne10 % QK8_1 == 0);

    const int64_t ne0 = dst->ne[0];

    int id;
    CUDA_CHECK(cudaGetDevice(&id));

    // the main device has a larger memory buffer to hold the results from all GPUs
    // nrows_dst == nrows of the matrix that the kernel writes into
    const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;

    switch (src0->type) {
        case GGML_TYPE_Q4_0:
            mul_mat_vec_q_cuda<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q4_1:
            mul_mat_vec_q_cuda<QK4_1, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q5_0:
            mul_mat_vec_q_cuda<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q5_1:
            mul_mat_vec_q_cuda<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q8_0:
            mul_mat_vec_q_cuda<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q2_K:
            mul_mat_vec_q_cuda<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q3_K:
            mul_mat_vec_q_cuda<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q4_K:
            mul_mat_vec_q_cuda<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q5_K:
            mul_mat_vec_q_cuda<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q6_K:
            mul_mat_vec_q_cuda<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_IQ2_XXS:
            mul_mat_vec_q_cuda<QK_K, QI2_XXS, block_iq2_xxs, 1, vec_dot_iq2_xxs_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_IQ2_XS:
            mul_mat_vec_q_cuda<QK_K, QI2_XS, block_iq2_xs, 1, vec_dot_iq2_xs_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_IQ2_S:
            mul_mat_vec_q_cuda<QK_K, QI2_S, block_iq2_s, 1, vec_dot_iq2_s_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_IQ3_XXS:
            mul_mat_vec_q_cuda<QK_K, QI3_XXS, block_iq3_xxs, 1, vec_dot_iq3_xxs_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_IQ1_S:
            mul_mat_vec_q_cuda<QK_K, QI1_S, block_iq1_s, 1, vec_dot_iq1_s_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_IQ4_NL:
            mul_mat_vec_q_cuda<QK4_NL, QI4_NL, block_iq4_nl, VDR_Q4_0_Q8_1_MMVQ, vec_dot_iq4_nl_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_IQ4_XS:
            mul_mat_vec_q_cuda<QK_K, QI4_XS, block_iq4_xs, 1, vec_dot_iq4_xs_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_IQ3_S:
            mul_mat_vec_q_cuda<QK_K, QI3_XS, block_iq3_s, 1, vec_dot_iq3_s_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        default:
            GGML_ASSERT(false);
            break;
    }

    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
    GGML_UNUSED(src1_ddf_i);
    GGML_UNUSED(src1_ncols);
    GGML_UNUSED(src1_padded_row_size);
}

static void ggml_cuda_op_dequantize_mul_mat_vec(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
    const int64_t src1_padded_row_size, cudaStream_t stream) {
    GGML_UNUSED(ctx);
    const int64_t ne00 = src0->ne[0];
    const int64_t row_diff = row_high - row_low;

    GGML_ASSERT(src1->type == GGML_TYPE_F32);

    // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
#ifdef GGML_CUDA_F16
    ggml_cuda_pool_alloc<half> src1_dfloat_a(ctx.pool());
    half * src1_dfloat = nullptr; // dfloat == half

    bool src1_convert_f16 =
        src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
        src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
        src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;

    if (src1_convert_f16) {
        src1_dfloat = src1_dfloat_a.alloc(ne00);
        const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
        GGML_ASSERT(to_fp16_cuda != nullptr);
        to_fp16_cuda(src1_ddf_i, src1_dfloat, ne00, stream);
    }
#else
    const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
#endif // GGML_CUDA_F16

    switch (src0->type) {
        case GGML_TYPE_Q4_0:
            dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_Q4_1:
            dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_Q5_0:
            dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_Q5_1:
            dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_Q8_0:
            dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_Q2_K:
            dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_Q3_K:
            dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_Q4_K:
            dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_Q5_K:
            dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_Q6_K:
            dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_F16:
            convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
            break;
        default:
            GGML_ASSERT(false);
            break;
    }

    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
    GGML_UNUSED(src1_ddq_i);
    GGML_UNUSED(src1_ncols);
    GGML_UNUSED(src1_padded_row_size);
}

static void ggml_cuda_op_mul_mat_cublas(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
    const int64_t src1_padded_row_size, cudaStream_t stream) {

    GGML_ASSERT(src0_dd_i  != nullptr);
    GGML_ASSERT(src1_ddf_i != nullptr);
    GGML_ASSERT(dst_dd_i   != nullptr);

    const int64_t ne00 = src0->ne[0];
    const int64_t ne10 = src1->ne[0];

    const int64_t ne0 = dst->ne[0];

    const int64_t row_diff = row_high - row_low;

    int id = ggml_cuda_get_device();

    // the main device has a larger memory buffer to hold the results from all GPUs
    // ldc == nrows of the matrix that cuBLAS writes into
    int ldc = id == ctx.device ? ne0 : row_diff;

    const int compute_capability = get_cuda_global_info().devices[id].cc;

    if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
        // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
        ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool());
        if (src0->type != GGML_TYPE_F16) {
            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
            GGML_ASSERT(to_fp16_cuda != nullptr);
            size_t ne = row_diff*ne00;
            src0_as_f16.alloc(ne);
            to_fp16_cuda(src0_dd_i, src0_as_f16.get(), ne, stream);
        }
        const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16.get();

        ggml_cuda_pool_alloc<half> src1_as_f16(ctx.pool());
        if (src1->type != GGML_TYPE_F16) {
            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
            GGML_ASSERT(to_fp16_cuda != nullptr);
            size_t ne = src1_ncols*ne10;
            src1_as_f16.alloc(ne);
            to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
        }
        const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16.get();
        ggml_cuda_pool_alloc<half> dst_f16(ctx.pool(), row_diff*src1_ncols);

        const half alpha_f16 = 1.0f;
        const half beta_f16 = 0.0f;

        CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
        CUBLAS_CHECK(
            cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
                    row_diff, src1_ncols, ne10,
                    &alpha_f16, src0_ptr,       CUDA_R_16F, ne00,
                                src1_ptr,       CUDA_R_16F, ne10,
                    &beta_f16,   dst_f16.get(), CUDA_R_16F, ldc,
                    CUBLAS_COMPUTE_16F,
                    CUBLAS_GEMM_DEFAULT_TENSOR_OP));

        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
        to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
    } else {
        ggml_cuda_pool_alloc<float> src0_ddq_as_f32(ctx.pool(id));
        ggml_cuda_pool_alloc<float> src1_ddq_as_f32(ctx.pool(id));

        if (src0->type != GGML_TYPE_F32) {
            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
            GGML_ASSERT(to_fp32_cuda != nullptr);
            src0_ddq_as_f32.alloc(row_diff*ne00);
            to_fp32_cuda(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
        }
        if (src1->type != GGML_TYPE_F32) {
            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src1->type);
            GGML_ASSERT(to_fp32_cuda != nullptr);
            src1_ddq_as_f32.alloc(src1_ncols*ne10);
            to_fp32_cuda(src1_ddf_i, src1_ddq_as_f32.get(), src1_ncols*ne10, stream);
        }

        const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
        const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();

        const float alpha = 1.0f;
        const float beta = 0.0f;

        CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
        CUBLAS_CHECK(
            cublasSgemm(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
                    row_diff, src1_ncols, ne10,
                    &alpha, src0_ddf_i,  ne00,
                            src1_ddf1_i, ne10,
                    &beta,  dst_dd_i,    ldc));
    }

    GGML_UNUSED(dst);
    GGML_UNUSED(src1_ddq_i);
    GGML_UNUSED(src1_padded_row_size);
}

static void ggml_cuda_op_rope(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
    GGML_UNUSED(ctx);
    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
    GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
    GGML_ASSERT(src0->type == dst->type);

    const int64_t ne00 = src0->ne[0];
    const int64_t ne01 = src0->ne[1];
    const int64_t ne2 = dst->ne[2];
    const int64_t nrows = ggml_nrows(src0);

    //const int n_past      = ((int32_t *) dst->op_params)[0];
    const int n_dims      = ((int32_t *) dst->op_params)[1];
    const int mode        = ((int32_t *) dst->op_params)[2];
    const int n_ctx       = ((int32_t *) dst->op_params)[3];
    const int n_orig_ctx  = ((int32_t *) dst->op_params)[4];

    // RoPE alteration for extended context
    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
    memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
    memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
    memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));

    const int32_t * pos = nullptr;
    if ((mode & 1) == 0) {
        GGML_ASSERT(src1->type == GGML_TYPE_I32);
        GGML_ASSERT(src1->ne[0] == ne2);
        pos = (const int32_t *) src1_dd;
    }

    const bool is_neox = mode & 2;
    const bool is_glm  = mode & 4;

    rope_corr_dims corr_dims;
    ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);

    // compute
    if (is_glm) {
        GGML_ASSERT(false);
        rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
    } else if (is_neox) {
        if (src0->type == GGML_TYPE_F32) {
            rope_neox_cuda(
                (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
                attn_factor, corr_dims, main_stream
            );
        } else if (src0->type == GGML_TYPE_F16) {
            rope_neox_cuda(
                (const half *)src0_dd, (half *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
                attn_factor, corr_dims, main_stream
            );
        } else {
            GGML_ASSERT(false);
        }
    } else {
        if (src0->type == GGML_TYPE_F32) {
            rope_cuda(
                (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
                attn_factor, corr_dims, main_stream
            );
        } else if (src0->type == GGML_TYPE_F16) {
            rope_cuda(
                (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
                attn_factor, corr_dims, main_stream
            );
        } else {
            GGML_ASSERT(false);
        }
    }

    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
    GGML_UNUSED(src1_dd);
}

static void ggml_cuda_op_alibi(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
    GGML_UNUSED(ctx);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    const int64_t ne00 = src0->ne[0];
    const int64_t ne01 = src0->ne[1];
    const int64_t ne02 = src0->ne[2];
    const int64_t nrows = ggml_nrows(src0);

    //const int n_past = ((int32_t *) dst->op_params)[0];
    const int n_head = ((int32_t *) dst->op_params)[1];
    float max_bias;
    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));

    //GGML_ASSERT(ne01 + n_past == ne00);
    GGML_ASSERT(n_head == ne02);

    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));

    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);

    alibi_f32_cuda(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream);

    GGML_UNUSED(src1);
    GGML_UNUSED(src1_dd);
}

static void ggml_cuda_op_pool2d(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
    GGML_UNUSED(ctx);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    const int32_t * opts = (const int32_t *)dst->op_params;
    enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
    const int k0 = opts[1];
    const int k1 = opts[2];
    const int s0 = opts[3];
    const int s1 = opts[4];
    const int p0 = opts[5];
    const int p1 = opts[6];

    const int64_t IH = src0->ne[1];
    const int64_t IW = src0->ne[0];

    const int64_t N = dst->ne[3];
    const int64_t OC = dst->ne[2];
    const int64_t OH = dst->ne[1];
    const int64_t OW = dst->ne[0];

    const int parallel_elements = N * OC * OH * OW;
    const int num_blocks = (parallel_elements + CUDA_POOL2D_BLOCK_SIZE - 1) / CUDA_POOL2D_BLOCK_SIZE;
    dim3 block_nums(num_blocks);
    pool2d_nchw_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, main_stream>>>(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, parallel_elements, src0_dd, dst_dd, op);

    GGML_UNUSED(src1);
    GGML_UNUSED(src1_dd);
}

static void ggml_cuda_op_im2col(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
    GGML_UNUSED(ctx);
    GGML_ASSERT(src0->type == GGML_TYPE_F16);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);

    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];

    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;

    const int64_t IC = src1->ne[is_2D ? 2 : 1];
    const int64_t IH = is_2D ? src1->ne[1] : 1;
    const int64_t IW =         src1->ne[0];

    const int64_t KH = is_2D ? src0->ne[1] : 1;
    const int64_t KW =         src0->ne[0];

    const int64_t OH = is_2D ? dst->ne[2] : 1;
    const int64_t OW =         dst->ne[1];

    const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
    const int64_t batch = src1->ne[3];
    const size_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32

    if(dst->type == GGML_TYPE_F16) {
        im2col_cuda(src1_dd, (half*) dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
    } else {
        im2col_cuda(src1_dd, (float*) dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
    }

    GGML_UNUSED(src0);
    GGML_UNUSED(src0_dd);
}

static void ggml_cuda_op_sum_rows(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
    GGML_UNUSED(ctx);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    const int64_t ncols = src0->ne[0];
    const int64_t nrows = ggml_nrows(src0);

    sum_rows_f32_cuda(src0_dd, dst_dd, ncols, nrows, main_stream);

    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
    GGML_UNUSED(src1_dd);
}

static void ggml_cuda_op_argsort(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
    GGML_UNUSED(ctx);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_I32);

    const int64_t ncols = src0->ne[0];
    const int64_t nrows = ggml_nrows(src0);

    enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];

    argsort_f32_i32_cuda(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream);

    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
    GGML_UNUSED(src1_dd);
}

static void ggml_cuda_op_diag_mask_inf(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
    GGML_UNUSED(ctx);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    const int64_t ne00 = src0->ne[0];
    const int64_t ne01 = src0->ne[1];
    const int nrows0 = ggml_nrows(src0);

    const int n_past = ((int32_t *) dst->op_params)[0];

    diag_mask_inf_f32_cuda(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);

    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
    GGML_UNUSED(src1_dd);
}

static void ggml_cuda_op_soft_max(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
    GGML_UNUSED(ctx);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional

    const int64_t ne00    = src0->ne[0];
    const int64_t nrows_x = ggml_nrows(src0);
    const int64_t nrows_y = src0->ne[1];

    float scale    = 1.0f;
    float max_bias = 0.0f;

    memcpy(&scale,    (float *) dst->op_params + 0, sizeof(float));
    memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));

    // positions tensor
    float * src2_dd = nullptr;

    ggml_tensor * src2 = dst->src[2];
    const bool use_src2 = src2 != nullptr;

    if (use_src2) {
        src2_dd = (float *)src2->data;
    }

    soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream);
}

static void ggml_cuda_op_scale(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
    GGML_UNUSED(ctx);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    float scale;
    memcpy(&scale, dst->op_params, sizeof(float));

    scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
    CUDA_CHECK(cudaGetLastError());

    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
    GGML_UNUSED(src1_dd);
}

static void ggml_cuda_op_clamp(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
    GGML_UNUSED(ctx);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    float min;
    float max;
    memcpy(&min, dst->op_params, sizeof(float));
    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));

    clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
    CUDA_CHECK(cudaGetLastError());

    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
    GGML_UNUSED(src1_dd);
}

// TODO: remove this function
static void ggml_cuda_op_flatten(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
    GGML_ASSERT(!src0 || ggml_backend_buffer_is_cuda(src0->buffer));
    GGML_ASSERT(!src1 || ggml_backend_buffer_is_cuda(src1->buffer));
    GGML_ASSERT(         ggml_backend_buffer_is_cuda(dst->buffer));

    // dd = data device
    float * src0_ddf = src0 ? (float *) src0->data : nullptr;
    float * src1_ddf = src1 ? (float *) src1->data : nullptr;
    float *  dst_ddf = (float *) dst->data;

    ggml_cuda_set_device(ctx.device);

    // do the computation
    op(ctx, src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, ctx.stream());
    CUDA_CHECK(cudaGetLastError());
}

static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
    static bool peer_access_enabled = false;

    const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;

    if (peer_access_enabled == enable_peer_access) {
        return;
    }

#ifdef NDEBUG
    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
        ggml_cuda_set_device(id);
        CUDA_CHECK(cudaDeviceSynchronize());
    }

    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
        ggml_cuda_set_device(id);

        for (int id_other = 0; id_other < ggml_backend_cuda_get_device_count(); ++id_other) {
            if (id == id_other) {
                continue;
            }
            if (id != main_device && id_other != main_device) {
                continue;
            }

            int can_access_peer;
            CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
            if (can_access_peer) {
                if (enable_peer_access) {
                    cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0);
                    if (err != cudaErrorPeerAccessAlreadyEnabled) {
                        CUDA_CHECK(err);
                    }
                } else {
                    cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
                    if (err != cudaErrorPeerAccessNotEnabled) {
                        CUDA_CHECK(err);
                    }
                }
            }
        }
    }
#endif // NDEBUG

    peer_access_enabled = enable_peer_access;

    GGML_UNUSED(main_device);
}

static void ggml_cuda_op_mul_mat(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
    const bool convert_src1_to_q8_1) {

    const int64_t ne00 = src0->ne[0];
    const int64_t ne01 = src0->ne[1];
    const int64_t ne02 = src0->ne[2];
    const int64_t ne03 = src0->ne[3];

    const int64_t ne10 = src1->ne[0];
    const int64_t ne11 = src1->ne[1];
    const int64_t ne12 = src1->ne[2];
    const int64_t ne13 = src1->ne[3];
    const int64_t nrows1 = ggml_nrows(src1);

    GGML_ASSERT(ne03 == ne13);

    const int64_t ne0 = dst->ne[0];
    const int64_t ne1 = dst->ne[1];

    const int nb2 = dst->nb[2];
    const int nb3 = dst->nb[3];

    GGML_ASSERT(ggml_backend_buffer_is_cuda(dst->buffer));
    GGML_ASSERT(ggml_backend_buffer_is_cuda(src1->buffer));
    ggml_backend_cuda_buffer_context * src1_ctx = (ggml_backend_cuda_buffer_context *) src1->buffer->context;
    ggml_backend_cuda_buffer_context * dst_ctx  = (ggml_backend_cuda_buffer_context *) dst->buffer->context;

    GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));

    GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);

    const int64_t i02_divisor = ne12 / ne02;

    const size_t src0_ts = ggml_type_size(src0->type);
    const size_t src0_bs = ggml_blck_size(src0->type);
    const size_t q8_1_ts = sizeof(block_q8_1);
    const size_t q8_1_bs = QK8_1;

    const bool src0_is_contiguous = ggml_is_contiguous(src0);
    const bool src1_is_contiguous = ggml_is_contiguous(src1);

    const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);

    const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);
    GGML_ASSERT(!(split && ne02 > 1));
    GGML_ASSERT(!(split && ne03 > 1));
    GGML_ASSERT(!(split && ne02 < ne12));

    ggml_tensor_extra_gpu * src0_extra = split ? (ggml_tensor_extra_gpu *) src0->extra : nullptr;


    std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
    if (split) {
        ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
        tensor_split = buft_ctx->tensor_split;
    }

    struct dev_data {
        ggml_cuda_pool_alloc<char>  src0_dd_alloc;
        ggml_cuda_pool_alloc<float> src1_ddf_alloc;
        ggml_cuda_pool_alloc<char>  src1_ddq_alloc;
        ggml_cuda_pool_alloc<float>   dst_dd_alloc;

        char  *  src0_dd = nullptr;
        float * src1_ddf = nullptr; // float
        char  * src1_ddq = nullptr; // q8_1
        float *   dst_dd = nullptr;

        int64_t  row_low;
        int64_t row_high;
    };

    dev_data dev[GGML_CUDA_MAX_DEVICES];

    int used_devices = 0;

    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
        // by default, use all rows
        dev[id].row_low  = 0;
        dev[id].row_high = ne01;

        // for multi GPU, get the row boundaries from tensor split
        // and round to mul_mat_q tile sizes
        if (split) {
            const int64_t rounding = get_row_rounding(src0->type, tensor_split);

            if (id != 0) {
                dev[id].row_low  = ne01*tensor_split[id];
                if (dev[id].row_low < ne01) {
                    dev[id].row_low -= dev[id].row_low % rounding;
                }
            }

            if (id != ggml_backend_cuda_get_device_count() - 1) {
                dev[id].row_high  = ne01*tensor_split[id + 1];
                if (dev[id].row_high < ne01) {
                    dev[id].row_high -= dev[id].row_high % rounding;
                }
            }
        }
    }

    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
        if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) {
            continue;
        }

        used_devices++;

        const bool src1_on_device = id == src1_ctx->device;
        const bool  dst_on_device = id == dst_ctx->device;

        ggml_cuda_set_device(id);
        cudaStream_t stream = ctx.stream(id, 0);

        if (src0_is_contiguous) {
            dev[id].src0_dd = split ? (char *) src0_extra->data_device[id] : (char *) src0->data;
        } else {
            dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), ggml_nbytes(src0));
        }

        if (src1_on_device && src1_is_contiguous) {
            dev[id].src1_ddf = (float *) src1->data;
        } else {
            dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1));
        }

        if (convert_src1_to_q8_1) {
            dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);

            if (src1_on_device && src1_is_contiguous) {
                quantize_row_q8_1_cuda(dev[id].src1_ddf, dev[id].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
                CUDA_CHECK(cudaGetLastError());
            }
        }

        if (dst_on_device) {
            dev[id].dst_dd = (float *) dst->data;
        } else {
            const size_t size_dst_ddf = split ? (dev[id].row_high - dev[id].row_low)*ne1 : ggml_nelements(dst);
            dev[id].dst_dd = dev[id].dst_dd_alloc.alloc(ctx.pool(id), size_dst_ddf);
        }
    }

    // if multiple devices are used they need to wait for the main device
    // here an event is recorded that signals that the main device has finished calculating the input data
    if (split && used_devices > 1) {
        ggml_cuda_set_device(ctx.device);
        CUDA_CHECK(cudaEventRecord(src0_extra->events[ctx.device][0], ctx.stream()));
    }

    const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
    for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
        const int64_t is = split ? (src1_col_0/src1_col_stride) % GGML_CUDA_MAX_STREAMS : 0;
        const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;

        for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
            if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) {
                continue;
            }

            const bool src1_on_device = id == src1_ctx->device;
            const bool  dst_on_device = id == dst_ctx->device;
            const int64_t row_diff = dev[id].row_high - dev[id].row_low;

            ggml_cuda_set_device(id);
            cudaStream_t stream = ctx.stream(id, is);

            // wait for main GPU data if necessary
            if (split && (id != ctx.device || is != 0)) {
                CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[ctx.device][0], 0));
            }

            for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
                const int64_t i03 = i0 / ne12;
                const int64_t i02 = i0 % ne12;

                const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;

                // for split tensors the data begins at i0 == i0_offset_low
                char  *  src0_dd_i =  dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
                float * src1_ddf_i = dev[id].src1_ddf + (i0*ne11 + src1_col_0) * ne10;
                char  * src1_ddq_i = dev[id].src1_ddq +  src1_ddq_i_offset;
                float *   dst_dd_i =   dev[id].dst_dd + (i0*ne1  + src1_col_0) * (dst_on_device ? ne0 : row_diff);

                // the main device memory buffer can be on VRAM scratch, with space for all partial results
                // in that case an offset on dst_ddf_i is needed
                if (id == ctx.device) {
                    dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split
                }

                // copy src0, src1 to device if necessary
                if (src1_is_contiguous) {
                    if (id != ctx.device) {
                        if (convert_src1_to_q8_1) {
                            char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset;
                            CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddq_i, id, src1_ddq_i_source, ctx.device,
                                                            src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
                        } else {
                            float * src1_ddf_i_source = (float *) src1->data;
                            src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
                            CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddf_i, id, src1_ddf_i_source, ctx.device,
                                                            src1_ncols*ne10*sizeof(float), stream));
                        }
                    }
                } else if (src1_on_device && !src1_is_contiguous) {
                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
                                src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
                } else {
                    GGML_ASSERT(false);
                }

                if (convert_src1_to_q8_1 && !src1_is_contiguous) {
                    quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
                    CUDA_CHECK(cudaGetLastError());
                }

                if (src1_col_0 == 0 && !src0_is_contiguous && i02 % i02_divisor == 0) {
                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, dev[id].row_low, dev[id].row_high, stream));
                }

                // do the computation
                op(ctx, src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
                    dev[id].row_low, dev[id].row_high, src1_ncols, src1_padded_col_size, stream);
                CUDA_CHECK(cudaGetLastError());

                // copy dst to host or other device if necessary
                if (!dst_on_device) {
                    void * dst_off_device = dst->data;
                    if (split) {
                        // src0 = weight matrix is saved as a transposed matrix for better memory layout.
                        // dst is NOT transposed.
                        // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
                        // Instead they need to be copied to the correct slice in ne0 = dst row index.
                        // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
                        float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
                        GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
                        dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;
#if !defined(GGML_USE_HIPBLAS)
                        // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
                        cudaMemcpy3DPeerParms p = {};
                        p.dstDevice = ctx.device;
                        p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), row_diff, src1_ncols);
                        p.srcDevice = id;
                        p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols);
                        p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1);
                        CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream));
#else
                        // HIP does not support cudaMemcpy3DPeerAsync or vmm pools
                        CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float),
                                                        dst_dd_i, row_diff*sizeof(float),
                                                        row_diff*sizeof(float), src1_ncols,
                                                        cudaMemcpyDeviceToDevice, stream));
#endif
                    } else {
                        float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
                        GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
                        dhf_dst_i += src1_col_0*ne0;
                        CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), cudaMemcpyDeviceToDevice, stream));
                    }
                }

                // add event for the main device to wait on until other device is done
                if (split && (id != ctx.device || is != 0)) {
                    CUDA_CHECK(cudaEventRecord(src0_extra->events[id][is], stream));
                }
            }
        }
    }

    // main device waits for all other devices to be finished
    if (split && ggml_backend_cuda_get_device_count() > 1) {
        int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
        is_max = is_max <= GGML_CUDA_MAX_STREAMS ? is_max : GGML_CUDA_MAX_STREAMS;

        ggml_cuda_set_device(ctx.device);
        for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
            if (dev[id].row_low == dev[id].row_high) {
                continue;
            }
            for (int64_t is = 0; is < is_max; ++is) {
                CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), src0_extra->events[id][is], 0));
            }
        }
    }
}

static void ggml_cuda_repeat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_repeat);
}

static void ggml_cuda_get_rows(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_get_rows);
}

static void ggml_cuda_add(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_add);
}

static void ggml_cuda_acc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_acc);
}

static void ggml_cuda_mul(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_mul);
}

static void ggml_cuda_div(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_div);
}

static void ggml_cuda_gelu(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_gelu);
}

static void ggml_cuda_silu(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_silu);
}

static void ggml_cuda_gelu_quick(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_gelu_quick);
}

static void ggml_cuda_tanh(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_tanh);
}

static void ggml_cuda_relu(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_relu);
}

static void ggml_cuda_hardsigmoid(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_hardsigmoid);
}

static void ggml_cuda_hardswish(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_hardswish);
}
static void ggml_cuda_leaky_relu(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_leaky_relu);
}

static void ggml_cuda_sqr(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_sqr);
}

static void ggml_cuda_norm(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_norm);
}

static void ggml_cuda_group_norm(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_group_norm);
}

static void ggml_cuda_concat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_concat);
}

static void ggml_cuda_upscale(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_upscale);
}

static void ggml_cuda_pad(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_pad);
}

static void ggml_cuda_arange(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_arange);
}

static void ggml_cuda_timestep_embedding(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_timestep_embedding);
}

static void ggml_cuda_rms_norm(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_rms_norm);
}

static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
    GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
    GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
    GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
    GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
    GGML_ASSERT(src0->type == GGML_TYPE_F16);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);

    const int64_t ne00 = src0->ne[0];
    const int64_t ne01 = src0->ne[1];
    const int64_t ne02 = src0->ne[2];

    const int64_t ne12 = src1->ne[2];

    ggml_cuda_set_device(ctx.device);
    cudaStream_t main_stream = ctx.stream();

    void  * src0_ddq = src0->data;
    float * src1_ddf = (float *) src1->data;
    float * dst_ddf  = (float *) dst->data;

    ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
}

static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
    GGML_ASSERT(!ggml_is_transposed(src0));
    GGML_ASSERT(!ggml_is_transposed(src1));
    GGML_ASSERT(!ggml_is_permuted(src0));
    GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
    GGML_ASSERT(src0->type == GGML_TYPE_F16);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);

    const int64_t ne00 = src0->ne[0];
    const int64_t ne01 = src0->ne[1];
    const int64_t ne02 = src0->ne[2];

    const int64_t nb01 = src0->nb[1];
    const int64_t nb02 = src0->nb[2];

    const int64_t ne12 = src1->ne[2];

    ggml_cuda_set_device(ctx.device);
    cudaStream_t main_stream = ctx.stream();

    void  * src0_ddq = src0->data;
    float * src1_ddf = (float *) src1->data;
    float * dst_ddf  = (float *) dst->data;

    const int64_t row_stride_x = nb01 / sizeof(half);
    const int64_t channel_stride_x = nb02 / sizeof(half);

    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
}

static __global__ void k_compute_batched_ptrs(
        const half * src0_as_f16, const half * src1_as_f16, char * dst,
        const void ** ptrs_src, void ** ptrs_dst,
        int64_t ne12, int64_t ne13,
        int64_t ne23,
        size_t  nb02, size_t  nb03,
        size_t  nb12, size_t  nb13,
        size_t  nbd2, size_t  nbd3,
        int64_t r2,   int64_t r3) {
    int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
    int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;

    if (i13 >= ne13 || i12 >= ne12) {
        return;
    }

    int64_t i03 = i13 / r3;
    int64_t i02 = i12 / r2;

    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)         dst + i12*nbd2 + i13*nbd3;
}

static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(!ggml_is_transposed(src0));
    GGML_ASSERT(!ggml_is_transposed(src1));

    GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
    GGML_ASSERT(src0->type == GGML_TYPE_F16);

    GGML_TENSOR_BINARY_OP_LOCALS

    const int64_t ne_dst = ggml_nelements(dst);

    ggml_cuda_set_device(ctx.device);
    cudaStream_t main_stream = ctx.stream();

    CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(), main_stream));

    void * src0_ddq = src0->data;
    half * src0_f16 = (half *) src0_ddq;
    float * src1_ddf = (float *) src1->data;
    float * dst_ddf  = (float *) dst->data;

    // convert src1 to fp16
    ggml_cuda_pool_alloc<half> src1_f16_alloc(ctx.pool());
    if (src1->type != GGML_TYPE_F16) {
        const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
        const int64_t ne_src1 = ggml_nelements(src1);
        src1_f16_alloc.alloc(ne_src1);
        GGML_ASSERT(to_fp16_cuda != nullptr);
        to_fp16_cuda(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
    }
    half * src1_f16 = src1->type == GGML_TYPE_F16 ? (half *) src1_ddf : src1_f16_alloc.get();

    ggml_cuda_pool_alloc<half> dst_f16(ctx.pool());
    char * dst_t;

    cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
    cudaDataType_t      cu_data_type    = CUDA_R_16F;

    // dst strides
    size_t nbd2 = dst->nb[2];
    size_t nbd3 = dst->nb[3];

    const half  alpha_f16 = 1.0f;
    const half  beta_f16  = 0.0f;

    const float alpha_f32 = 1.0f;
    const float beta_f32  = 0.0f;

    const void * alpha = &alpha_f16;
    const void * beta  = &beta_f16;

    if (dst->op_params[0] == GGML_PREC_DEFAULT) {
        dst_t = (char *) dst_f16.alloc(ne_dst);

        nbd2 /= sizeof(float) / sizeof(half);
        nbd3 /= sizeof(float) / sizeof(half);
    } else {
        dst_t = (char *) dst_ddf;

        cu_compute_type = CUBLAS_COMPUTE_32F;
        cu_data_type    = CUDA_R_32F;

        alpha = &alpha_f32;
        beta  = &beta_f32;
    }

    GGML_ASSERT(ne12 % ne02 == 0);
    GGML_ASSERT(ne13 % ne03 == 0);

    // broadcast factors
    const int64_t r2 = ne12/ne02;
    const int64_t r3 = ne13/ne03;

#if 0
    // use cublasGemmEx
    {
        for (int i13 = 0; i13 < ne13; ++i13) {
            for (int i12 = 0; i12 < ne12; ++i12) {
                int i03 = i13 / r3;
                int i02 = i12 / r2;

                CUBLAS_CHECK(
                        cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                            ne01, ne11, ne10,
                            alpha, (const char *) src0_as_f16 + i02*src0->nb[2]   + i03*src0->nb[3]  , CUDA_R_16F,   nb01/sizeof(half),
                                   (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F,   nb11/sizeof(float),
                            beta,  (      char *)       dst_t + i12*nbd2          + i13*nbd3,          cu_data_type, ne01,
                            cu_compute_type,
                            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
            }
        }
    }
#else
    if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
        // there is no broadcast and src0, src1 are contiguous across dims 2, 3
        // use cublasGemmStridedBatchedEx
        CUBLAS_CHECK(
        cublasGemmStridedBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
                ne01, ne11, ne10,
                alpha, (const char *) src0_f16, CUDA_R_16F,   nb01/nb00, nb02/nb00,  // strideA
                       (const char *) src1_f16, CUDA_R_16F,   nb11/nb10, nb12/nb10,  // strideB
                beta,  (      char *)    dst_t, cu_data_type, ne01,       nb2/nb0,   // strideC
                ne12*ne13,
                cu_compute_type,
                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
    } else {
        // use cublasGemmBatchedEx
        const int ne23 = ne12*ne13;

        ggml_cuda_pool_alloc<const void *> ptrs_src(ctx.pool(), 2*ne23);
        ggml_cuda_pool_alloc<      void *> ptrs_dst(ctx.pool(), 1*ne23);

        dim3 block_dims(ne13, ne12);
        k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
                src0_f16, src1_f16, dst_t,
                ptrs_src.get(), ptrs_dst.get(),
                ne12, ne13,
                ne23,
                nb02, nb03,
                src1->type == GGML_TYPE_F16 ? nb12 : nb12/2,
                src1->type == GGML_TYPE_F16 ? nb13 : nb13/2,
                nbd2, nbd3,
                r2, r3);
        CUDA_CHECK(cudaGetLastError());

        CUBLAS_CHECK(
        cublasGemmBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
                ne01, ne11, ne10,
                alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F,   nb01/nb00,
                       (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F,   nb11/nb10,
                beta,  (      void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01,
                ne23,
                cu_compute_type,
                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
    }
#endif

    if (dst->op_params[0] == GGML_PREC_DEFAULT) {
        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
        to_fp32_cuda(dst_f16.get(), dst_ddf, ne_dst, main_stream);
    }
}

static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);

    int64_t min_compute_capability = INT_MAX;

    bool any_pascal_with_slow_fp16 = false;
    if (split) {
        ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
        auto & tensor_split = buft_ctx->tensor_split;
        for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
            // skip devices that are not going to do any work:
            if (tensor_split[id] >= (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
                continue;
            }

            if (min_compute_capability > get_cuda_global_info().devices[id].cc) {
                min_compute_capability = get_cuda_global_info().devices[id].cc;
            }
            if (get_cuda_global_info().devices[id].cc == 610) {
                any_pascal_with_slow_fp16 = true;
            }
        }
    } else {
        min_compute_capability    = get_cuda_global_info().devices[ctx.device].cc;
        any_pascal_with_slow_fp16 = get_cuda_global_info().devices[ctx.device].cc == 610;
    }

    // check data types and tensor shapes for custom matrix multiplication kernels:
    bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
        && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;

    bool          use_mul_mat_vec_q =  ggml_is_quantized(src0->type)
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;

    bool              use_mul_mat_q =  ggml_cuda_supports_mmq(src0->type)
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)

    const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;

#ifdef CUDA_USE_TENSOR_CORES
    use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
#endif // CUDA_USE_TENSOR_CORES

#else

    // fp16 performance is good on Volta or newer and on P100 (compute capability 6.0)
    const bool fp16_performance_good = min_compute_capability >= CC_PASCAL && !any_pascal_with_slow_fp16;

    // mmvq and mmq need the __dp4a instruction which on NVIDIA is only available for CC >= 6.1
    use_mul_mat_vec_q = use_mul_mat_vec_q && min_compute_capability >= MIN_CC_DP4A;
    use_mul_mat_q     = use_mul_mat_q     && min_compute_capability >= MIN_CC_DP4A;

#ifdef CUDA_USE_TENSOR_CORES
    // when tensor cores are available, use them for large batch size
    // ref: https://github.com/ggerganov/llama.cpp/pull/3776
    use_mul_mat_q     = use_mul_mat_q     && (!fp16_performance_good || src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
#endif // CUDA_USE_TENSOR_CORES

#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)

    // if mmvq is available it's a better choice than dmmv:
#ifndef GGML_CUDA_FORCE_DMMV
    use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
#endif // GGML_CUDA_FORCE_DMMV

    // debug helpers
    //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
    //printf("      %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
    //printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
    //printf("      %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
    //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
    //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);

    if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
        // KQ single-batch
        ggml_cuda_mul_mat_vec_p021(ctx, src0, src1, dst);
    } else if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
        // KQV single-batch
        ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst);
    } else if (!split && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
        // KQ + KQV multi-batch
        ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
    } else if (use_dequantize_mul_mat_vec) {
        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
    } else if (use_mul_mat_vec_q) {
        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
    } else if (use_mul_mat_q) {
        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
    } else {
        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
    }
}

#if 0
template<typename ... Srcs>
static __global__ void k_compute_batched_ptrs_id(
        const void ** ptrs_src, void ** ptrs_dst,
        int ne12, int ne13,
        int ne23,
        int nb02, int nb03,
        int nb12, int nb13,
        int nb2, int nb3,
        int r2, int r3,
        ggml_type src0_type, half * src0_as_f16, int64_t src0_ne,
        const half * src1_f16, half * dst_f16,
        const int32_t * ids, const int id,
        Srcs... src0s) {

    int i = ids[id];

    half * src0_f16;
    const void * srcs_ar[] = { (const half *) src0s... };
    if (src0_type == GGML_TYPE_F16) {
        src0_f16 = (half *) srcs_ar[i];
    } else {
        src0_f16 = src0_as_f16;
        if (threadIdx.x == 0 && threadIdx.y == 0) {
            const to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(src0_type);
            to_fp16(srcs_ar[i], src0_f16, src0_ne, cudaStreamFireAndForget);
        }
    }

    int i13 = blockIdx.x * blockDim.x + threadIdx.x;
    int i12 = blockIdx.y * blockDim.y + threadIdx.y;

    if (i13 >= ne13 || i12 >= ne12) {
        return;
    }

    int i03 = i13 / r3;
    int i02 = i12 / r2;

    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_f16 + i02*nb02   + i03*nb03;
    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_f16 + i12*nb12/2 + i13*nb13/2;
    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)  dst_f16 + i12* nb2/2 + i13* nb3/2;
}

static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
    const struct ggml_tensor * ids = dst->src[0];
    const struct ggml_tensor * src1 = dst->src[1];
    const struct ggml_tensor * src00 = dst->src[2];

    const int id = dst->op_params[0];

    GGML_ASSERT(!ggml_is_transposed(src00));
    GGML_ASSERT(!ggml_is_transposed(src1));

    GGML_ASSERT(src00->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);

    const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00);
    const int64_t ne01 = src00->ne[1];
    const int64_t ne02 = src00->ne[2];
    const int64_t ne03 = src00->ne[3];

    //const int64_t nb01 = src00->nb[1];
    const int64_t nb02 = src00->nb[2]; GGML_UNUSED(nb02);
    const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03);

    const int64_t ne10 = src1->ne[0];
    const int64_t ne11 = src1->ne[1];
    const int64_t ne12 = src1->ne[2];
    const int64_t ne13 = src1->ne[3];

    //const int64_t nb11 = src1->nb[1];
    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);

    const int64_t ne1 = ggml_nelements(src1);
    const int64_t ne  = ggml_nelements(dst);

    ggml_cuda_set_device(g_main_device);
    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));

    //ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
    //void * src0_ddq = src0_extra->data_device[g_main_device];
    //half * src0_as_f16 = (half *) src0_ddq;

    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];

    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];

    // convert src1 to fp16
    const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
    GGML_ASSERT(to_fp16_cuda != nullptr);

    size_t src1_as = 0;
    half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
    to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);

    size_t dst_as = 0;
    half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);

    GGML_ASSERT(ne12 % ne02 == 0);
    GGML_ASSERT(ne13 % ne03 == 0);

    // broadcast factors
    const int64_t r2 = ne12/ne02;
    const int64_t r3 = ne13/ne03;

    const half alpha_f16 = 1.0f;
    const half beta_f16  = 0.0f;

    // use cublasGemmBatchedEx
    const int ne23 = ne12*ne13;

    const void ** ptrs_src = nullptr;
          void ** ptrs_dst = nullptr;

    size_t ptrs_src_s = 0;
    size_t ptrs_dst_s = 0;

    ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
    ptrs_dst = (      void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);

    int64_t src0_ne = ggml_nelements(src00);
    half * src0_as_f16 = nullptr;
    size_t src0_as = 0;
    if (src00->type != GGML_TYPE_F16) {
        src0_as_f16 = (half *) ggml_cuda_pool_malloc(src0_ne * sizeof(half), &src0_as);
    }

    static_assert(GGML_MAX_SRC == 6, "GGML_MAX_SRC == 6");
    dim3 block_dims(ne13, ne12);
    k_compute_batched_ptrs_id<<<1, block_dims, 0, main_stream>>>(
            ptrs_src, ptrs_dst,
            ne12, ne13,
            ne23,
            ne00*ne01*sizeof(half), ne00*ne01*ne02*sizeof(half),
            nb12, nb13,
            dst->nb[2], dst->nb[3],
            r2, r3,
            src00->type, src0_as_f16, src0_ne,
            src1_as_f16, dst_f16,
            (const int *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device], id,
            dst->src[2] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[2]->extra)->data_device[g_main_device] : nullptr,
            dst->src[3] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[3]->extra)->data_device[g_main_device] : nullptr,
            dst->src[4] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device] : nullptr,
            dst->src[5] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device] : nullptr
    );
    CUDA_CHECK(cudaGetLastError());

    CUBLAS_CHECK(
    cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
            ne01, ne11, ne10,
            &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, ne00,
                        (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, ne10,
            &beta_f16,  (      void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
            ne23,
            CUBLAS_COMPUTE_16F,
            CUBLAS_GEMM_DEFAULT_TENSOR_OP));

    if (src0_as != 0) {
        ggml_cuda_pool_free(src0_as_f16, src0_as);
    }
    if (ptrs_src_s != 0) {
        ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
    }
    if (ptrs_dst_s != 0) {
        ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
    }

    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
    to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);

    ggml_cuda_pool_free(src1_as_f16, src1_as);
    ggml_cuda_pool_free(dst_f16, dst_as);
}
#endif

static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
#if 0
    ggml_cuda_mul_mat_id_cublas(dst);
    // TODO: mmq/mmv support
#endif
    cudaStream_t stream = ctx.stream();

    const size_t nb11 = src1->nb[1];
    const size_t nb1  =  dst->nb[1];

    const struct ggml_tensor * ids = src0;
    const int32_t id = ((int32_t *) dst->op_params)[0];
    const int32_t n_as = ((int32_t *) dst->op_params)[1];

    std::vector<char> ids_host(ggml_nbytes(ids));
    const char * ids_dev = (const char *) ids->data;
    CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));

    ggml_tensor src1_row = *src1;
    ggml_tensor dst_row = *dst;

    char * src1_original = (char *) src1->data;
    char * dst_original  = (char *)  dst->data;

    if (src1->ne[1] == 1) {
        for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
            const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);

            GGML_ASSERT(row_id >= 0 && row_id < n_as);

            const struct ggml_tensor * src0_row = dst->src[row_id + 2];

            src1_row.data = src1_original + i01*src1->nb[1];
            dst_row.data  =  dst_original + i01*dst->nb[1];

            ggml_cuda_mul_mat(ctx, src0_row, &src1_row, &dst_row);
        }
    } else {
        ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
        ggml_cuda_pool_alloc<char>  dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));

        src1_row.data = src1_contiguous.get();
        dst_row.data  =  dst_contiguous.get();

        for (int32_t row_id = 0; row_id < n_as; ++row_id) {
            const struct ggml_tensor * src0_row = dst->src[row_id + 2];

            int64_t num_src1_rows = 0;
            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);

                if (row_id_i != row_id) {
                    continue;
                }

                GGML_ASSERT(row_id >= 0 && row_id < n_as);

                CUDA_CHECK(cudaMemcpyAsync(src1_contiguous.get() + num_src1_rows*nb11, src1_original + i01*nb11,
                                        nb11, cudaMemcpyDeviceToDevice, stream));
                num_src1_rows++;
            }

            if (num_src1_rows == 0) {
                continue;
            }

            src1_row.ne[1] = num_src1_rows;
            dst_row.ne[1] = num_src1_rows;

            src1_row.nb[1] = nb11;
            src1_row.nb[2] = num_src1_rows*nb11;
            src1_row.nb[3] = num_src1_rows*nb11;

            dst_row.nb[1] = nb1;
            dst_row.nb[2] = num_src1_rows*nb1;
            dst_row.nb[3] = num_src1_rows*nb1;

            ggml_cuda_mul_mat(ctx, src0_row, &src1_row, &dst_row);

            num_src1_rows = 0;
            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);

                if (row_id_i != row_id) {
                    continue;
                }

                GGML_ASSERT(row_id >= 0 && row_id < n_as);

                CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous.get() + num_src1_rows*nb1,
                                        nb1, cudaMemcpyDeviceToDevice, stream));
                num_src1_rows++;
            }
        }
    }
}

static void ggml_cuda_scale(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_scale);
}

static void ggml_cuda_clamp(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_clamp);
}

static void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    const int64_t ne = ggml_nelements(src0);
    GGML_ASSERT(ne == ggml_nelements(src1));

    GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));

    GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
    GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);

    const int64_t ne00 = src0->ne[0];
    const int64_t ne01 = src0->ne[1];
    const int64_t ne02 = src0->ne[2];

    //GGML_ASSERT(src0->ne[3] == 1);

    const int64_t nb00 = src0->nb[0];
    const int64_t nb01 = src0->nb[1];
    const int64_t nb02 = src0->nb[2];
    const int64_t nb03 = src0->nb[3];

    const int64_t ne10 = src1->ne[0];
    const int64_t ne11 = src1->ne[1];
    const int64_t ne12 = src1->ne[2];

    //GGML_ASSERT(src1->ne[3] == 1);

    const int64_t nb10 = src1->nb[0];
    const int64_t nb11 = src1->nb[1];
    const int64_t nb12 = src1->nb[2];
    const int64_t nb13 = src1->nb[3];

    ggml_cuda_set_device(ctx.device);
    cudaStream_t main_stream = ctx.stream();

    char * src0_ddc = (char *) src0->data;
    char * src1_ddc = (char *) src1->data;

    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
        ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
        ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
        ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
        ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
        ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
        ggml_cpy_f32_q5_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
        ggml_cpy_f32_iq4_nl_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
        ggml_cpy_f32_q5_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
        ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
        ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else {
        fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
                ggml_type_name(src0->type), ggml_type_name(src1->type));
        GGML_ASSERT(false);
    }

    GGML_UNUSED(dst);
}

static void ggml_cuda_dup(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    // TODO: why do we pass dst as src1 here?
    ggml_cuda_cpy(ctx, src0, dst, nullptr);
    GGML_UNUSED(src1);
}

static void ggml_cuda_diag_mask_inf(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_diag_mask_inf);
}

static void ggml_cuda_soft_max(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_soft_max);
}

static void ggml_cuda_rope(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_rope);
}

static void ggml_cuda_alibi(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_alibi);
}

static void ggml_cuda_pool2d(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_pool2d);
}

static void ggml_cuda_im2col(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_im2col);
}

static void ggml_cuda_sum_rows(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(ggml_is_contiguous(src0));
    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_sum_rows);
}

static void ggml_cuda_argsort(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(ggml_is_contiguous(src0));
    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_argsort);
}

static void ggml_cuda_nop(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_UNUSED(ctx);
    GGML_UNUSED(src0);
    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
}

static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * tensor) {
    // FIXME: where should this be?
    if (tensor->src[0] != nullptr && ggml_backend_buffer_is_cuda_split(tensor->src[0]->buffer)) {
        ggml_cuda_set_peer_access(tensor->src[1]->ne[1], ctx.device);
    }

    ggml_cuda_func_t func;

    switch (tensor->op) {
        case GGML_OP_REPEAT:
            func = ggml_cuda_repeat;
            break;
        case GGML_OP_GET_ROWS:
            func = ggml_cuda_get_rows;
            break;
        case GGML_OP_DUP:
            func = ggml_cuda_dup;
            break;
        case GGML_OP_ADD:
            func = ggml_cuda_add;
            break;
        case GGML_OP_ACC:
            func = ggml_cuda_acc;
            break;
        case GGML_OP_MUL:
            func = ggml_cuda_mul;
            break;
        case GGML_OP_DIV:
            func = ggml_cuda_div;
            break;
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(tensor)) {
                case GGML_UNARY_OP_GELU:
                    func = ggml_cuda_gelu;
                    break;
                case GGML_UNARY_OP_SILU:
                    func = ggml_cuda_silu;
                    break;
                case GGML_UNARY_OP_GELU_QUICK:
                    func = ggml_cuda_gelu_quick;
                    break;
                case GGML_UNARY_OP_TANH:
                    func = ggml_cuda_tanh;
                    break;
                case GGML_UNARY_OP_RELU:
                    func = ggml_cuda_relu;
                    break;
                case GGML_UNARY_OP_HARDSIGMOID:
                    func = ggml_cuda_hardsigmoid;
                    break;
                case GGML_UNARY_OP_HARDSWISH:
                    func = ggml_cuda_hardswish;
                    break;
                default:
                    return false;
            }
            break;
        case GGML_OP_NORM:
            func = ggml_cuda_norm;
            break;
        case GGML_OP_GROUP_NORM:
            func = ggml_cuda_group_norm;
            break;
        case GGML_OP_CONCAT:
            func = ggml_cuda_concat;
            break;
        case GGML_OP_UPSCALE:
            func = ggml_cuda_upscale;
            break;
        case GGML_OP_PAD:
            func = ggml_cuda_pad;
            break;
        case GGML_OP_ARANGE:
            func = ggml_cuda_arange;
            break;
        case GGML_OP_TIMESTEP_EMBEDDING:
            func = ggml_cuda_timestep_embedding;
            break;
        case GGML_OP_LEAKY_RELU:
            func = ggml_cuda_leaky_relu;
            break;
        case GGML_OP_RMS_NORM:
            func = ggml_cuda_rms_norm;
            break;
        case GGML_OP_MUL_MAT:
            if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
                fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
                return false;
            } else {
                func = ggml_cuda_mul_mat;
            }
            break;
        case GGML_OP_MUL_MAT_ID:
            func = ggml_cuda_mul_mat_id;
            break;
        case GGML_OP_SCALE:
            func = ggml_cuda_scale;
            break;
        case GGML_OP_SQR:
            func = ggml_cuda_sqr;
            break;
        case GGML_OP_CLAMP:
            func = ggml_cuda_clamp;
            break;
        case GGML_OP_CPY:
            func = ggml_cuda_cpy;
            break;
        case GGML_OP_CONT:
            func = ggml_cuda_dup;
            break;
        case GGML_OP_NONE:
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
        case GGML_OP_PERMUTE:
        case GGML_OP_TRANSPOSE:
            func = ggml_cuda_nop;
            break;
        case GGML_OP_DIAG_MASK_INF:
            func = ggml_cuda_diag_mask_inf;
            break;
        case GGML_OP_SOFT_MAX:
            func = ggml_cuda_soft_max;
            break;
        case GGML_OP_ROPE:
            func = ggml_cuda_rope;
            break;
        case GGML_OP_ALIBI:
            func = ggml_cuda_alibi;
            break;
        case GGML_OP_IM2COL:
            func = ggml_cuda_im2col;
            break;
        case GGML_OP_POOL_2D:
            func = ggml_cuda_pool2d;
            break;
        case GGML_OP_SUM_ROWS:
            func = ggml_cuda_sum_rows;
            break;
        case GGML_OP_ARGSORT:
            func = ggml_cuda_argsort;
            break;
        default:
            return false;
    }

    func(ctx, tensor->src[0], tensor->src[1], tensor);
    return true;
}


////////////////////////////////////////////////////////////////////////////////


// backend

GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

    return cuda_ctx->name.c_str();
}

GGML_CALL static void ggml_backend_cuda_free(ggml_backend_t backend) {
    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

    delete cuda_ctx;
    delete backend;
}

GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

    return ggml_backend_cuda_buffer_type(cuda_ctx->device);
}

GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

    GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");

    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream()));
}

GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

    GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");

    CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cuda_ctx->stream()));
}

GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
    GGML_ASSERT(ggml_backend_is_cuda(backend_src) || ggml_backend_is_cuda(backend_dst));

    ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
    ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;

    if (!ggml_backend_buffer_is_cuda(src->buffer)) {
        return false;
    }

    if (!ggml_backend_buffer_is_cuda(dst->buffer)) {
        return false;
    }

    // device -> device
    ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
    ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;

    if (backend_src != backend_dst) {
        ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
        ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;

        GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device);
        GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device);

        if (!cuda_ctx_src->copy_event) {
            ggml_cuda_set_device(cuda_ctx_src->device);
            CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
        }

        // copy on src stream
        if (cuda_ctx_src->device == cuda_ctx_dst->device) {
            CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
        } else {
            CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
        }

        // record event on src stream
        CUDA_CHECK(cudaEventRecord(cuda_ctx_src->copy_event, cuda_ctx_src->stream()));

        // wait on dst stream for the copy to complete
        CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_dst->stream(), cuda_ctx_src->copy_event, 0));
    } else {
        // src and dst are on the same backend
        CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
    }
    return true;
}

GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

    CUDA_CHECK(cudaStreamSynchronize(cuda_ctx->stream()));

    GGML_UNUSED(backend);
}

GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

    ggml_cuda_set_device(cuda_ctx->device);

    for (int i = 0; i < cgraph->n_nodes; i++) {
        ggml_tensor * node = cgraph->nodes[i];

        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
            continue;
        }

#ifndef NDEBUG
        assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            if (node->src[j] != nullptr) {
                assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
            }
        }
#endif

        bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
        if (!ok) {
            fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
        }
        GGML_ASSERT(ok);
    }

    return GGML_STATUS_SUCCESS;
}

GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
    switch (op->op) {
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(op)) {
                case GGML_UNARY_OP_GELU:
                case GGML_UNARY_OP_SILU:
                case GGML_UNARY_OP_RELU:
                case GGML_UNARY_OP_HARDSIGMOID:
                case GGML_UNARY_OP_HARDSWISH:
                case GGML_UNARY_OP_GELU_QUICK:
                case GGML_UNARY_OP_TANH:
                    return true;
                default:
                    return false;
            }
            break;
        case GGML_OP_MUL_MAT:
        case GGML_OP_MUL_MAT_ID:
            {
                struct ggml_tensor * a;
                struct ggml_tensor * b;
                if (op->op == GGML_OP_MUL_MAT) {
                    a = op->src[0];
                    b = op->src[1];
                } else {
                    a = op->src[2];
                    b = op->src[1];
                }
                if (a->ne[3] != b->ne[3]) {
                    return false;
                }
                ggml_type a_type = a->type;
                if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
                    a_type == GGML_TYPE_IQ1_S   || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S   ||
                    a_type == GGML_TYPE_IQ2_S   || a_type == GGML_TYPE_IQ4_XS) {
                    if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
                        return false;
                    }
                }
                return true;
            } break;
        case GGML_OP_GET_ROWS:
            {
                switch (op->src[0]->type) {
                    case GGML_TYPE_F16:
                    case GGML_TYPE_F32:
                    case GGML_TYPE_Q4_0:
                    case GGML_TYPE_Q4_1:
                    case GGML_TYPE_Q5_0:
                    case GGML_TYPE_Q5_1:
                    case GGML_TYPE_Q8_0:
                        return true;
                    default:
                        return false;
                }
            } break;
        case GGML_OP_CPY:
            {
                ggml_type src0_type = op->src[0]->type;
                ggml_type src1_type = op->src[1]->type;
                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
                    return true;
                }
                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
                    return true;
                }
                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
                    return true;
                }
                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
                    return true;
                }
                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) {
                    return true;
                }
                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q5_0) {
                    return true;
                }
                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q5_1) {
                    return true;
                }
                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) {
                    return true;
                }
                if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
                    return true;
                }
                if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
                    return true;
                }
                return false;
            } break;
        case GGML_OP_DUP:
        case GGML_OP_REPEAT:
        case GGML_OP_CONCAT:
            {
                ggml_type src0_type = op->src[0]->type;
                return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
            } break;
        case GGML_OP_NONE:
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
        case GGML_OP_PERMUTE:
        case GGML_OP_TRANSPOSE:
        case GGML_OP_NORM:
        case GGML_OP_ADD:
        case GGML_OP_MUL:
        case GGML_OP_DIV:
        case GGML_OP_RMS_NORM:
        case GGML_OP_SCALE:
        case GGML_OP_SQR:
        case GGML_OP_CLAMP:
        case GGML_OP_CONT:
        case GGML_OP_DIAG_MASK_INF:
        case GGML_OP_SOFT_MAX:
        case GGML_OP_ROPE:
        case GGML_OP_ALIBI:
        case GGML_OP_IM2COL:
        case GGML_OP_POOL_2D:
        case GGML_OP_SUM_ROWS:
        case GGML_OP_ARGSORT:
        case GGML_OP_ACC:
        case GGML_OP_GROUP_NORM:
        case GGML_OP_UPSCALE:
        case GGML_OP_PAD:
        case GGML_OP_ARANGE:
        case GGML_OP_TIMESTEP_EMBEDDING:
        case GGML_OP_LEAKY_RELU:
            return true;
        default:
            return false;
    }

    GGML_UNUSED(backend);
}

GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
    const int min_batch_size = 32;

    return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;

    GGML_UNUSED(backend);
}

static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) {
    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

    ggml_cuda_set_device(cuda_ctx->device);

    cudaEvent_t event;
    CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));

    return new ggml_backend_event {
        /* .backend = */ backend,
        /* .context = */ event,
    };
}

static void ggml_backend_cuda_event_free(ggml_backend_event_t event) {
    CUDA_CHECK(cudaEventDestroy((cudaEvent_t)event->context));

    delete event;
}

static void ggml_backend_cuda_event_record(ggml_backend_event_t event) {
    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)event->backend->context;

    CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, cuda_ctx->stream()));
}

static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

    if (ggml_backend_is_cuda(event->backend)) {
        CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), (cudaEvent_t)event->context, 0));
    } else {
#if 0
        // untested
        auto wait_fn = [](void * user_data) {
            ggml_backend_event_t event = (ggml_backend_event_t)user_data;
            ggml_backend_event_synchronize(event);
        };

        CUDA_CHECK(cudaLaunchHostFunc(cuda_ctx->stream(), wait_fn, event));
#endif
        GGML_ASSERT(false);
    }
}

static void ggml_backend_cuda_event_synchronize(ggml_backend_event_t event) {
    CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
}

static ggml_backend_i ggml_backend_cuda_interface = {
    /* .get_name                = */ ggml_backend_cuda_name,
    /* .free                    = */ ggml_backend_cuda_free,
    /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
    /* .set_tensor_async        = */ ggml_backend_cuda_set_tensor_async,
    /* .get_tensor_async        = */ ggml_backend_cuda_get_tensor_async,
    /* .cpy_tensor_async        = */ ggml_backend_cuda_cpy_tensor_async,
    /* .synchronize             = */ ggml_backend_cuda_synchronize,
    /* .graph_plan_create       = */ NULL,
    /* .graph_plan_free         = */ NULL,
    /* .graph_plan_compute      = */ NULL,
    /* .graph_compute           = */ ggml_backend_cuda_graph_compute,
    /* .supports_op             = */ ggml_backend_cuda_supports_op,
    /* .offload_op              = */ ggml_backend_cuda_offload_op,
    /* .event_new               = */ ggml_backend_cuda_event_new,
    /* .event_free              = */ ggml_backend_cuda_event_free,
    /* .event_record            = */ ggml_backend_cuda_event_record,
    /* .event_wait              = */ ggml_backend_cuda_event_wait,
    /* .event_synchronize       = */ ggml_backend_cuda_event_synchronize,
};

static ggml_guid_t ggml_backend_cuda_guid() {
    static ggml_guid guid = { 0x2c, 0xdd, 0xe8, 0x1c, 0x65, 0xb3, 0x65, 0x73, 0x6a, 0x12, 0x88, 0x61, 0x1c, 0xc9, 0xdc, 0x25 };
    return &guid;
}

GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
    if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
        fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
        return nullptr;
    }

    ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
    if (ctx == nullptr) {
        fprintf(stderr, "%s: error: failed to allocate context\n", __func__);
        return nullptr;
    }

    ggml_backend_t cuda_backend = new ggml_backend {
        /* .guid      = */ ggml_backend_cuda_guid(),
        /* .interface = */ ggml_backend_cuda_interface,
        /* .context   = */ ctx
    };

    return cuda_backend;
}

GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend) {
    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cuda_guid());
}

GGML_CALL int ggml_backend_cuda_get_device_count() {
    return get_cuda_global_info().device_count;
}

GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
    cudaDeviceProp prop;
    CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
    snprintf(description, description_size, "%s", prop.name);
}

GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
    ggml_cuda_set_device(device);

    CUDA_CHECK(cudaMemGetInfo(free, total));
}

GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
    if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
        return false;
    }

    cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
    if (err != cudaSuccess) {
        // clear the error
        cudaGetLastError();

        fprintf(stderr, "%s: warning: failed to register %.2f MiB of pinned memory: %s\n", __func__,
                size/1024.0/1024.0, cudaGetErrorString(err));
        return false;
    }
    return true;
}

GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
    if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
        return;
    }

    cudaError_t err = cudaHostUnregister(buffer);
    if (err != cudaSuccess) {
        // clear the error
        cudaGetLastError();
    }
}

// backend registry
GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
    ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
    return cuda_backend;

    GGML_UNUSED(params);
}

extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();

GGML_CALL int ggml_backend_cuda_reg_devices() {
    int device_count = ggml_backend_cuda_get_device_count();
    //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
    for (int i = 0; i < device_count; i++) {
        char name[128];
        snprintf(name, sizeof(name), "%s%d", GGML_CUDA_NAME, i);
        ggml_backend_register(name, ggml_backend_reg_cuda_init, ggml_backend_cuda_buffer_type(i), (void *) (intptr_t) i);
    }
    return device_count;
}
-												ggml : always define ggml_fp16_t as uint16_t (#5666)

* ggml : always define ggml_fp16_t as uint16_t

ggml-ci

* ggml : cont

ggml-ci

* ggml : cont

* ggml : cont

ggml-ci

* ggml : cont

ggml-ci

* cuda : no longer ggml headers last

ggml-ci

* ggml : fix q6_K FP16 -> FP32 conversion

ggml-ci

* ggml : more FP16 -> FP32 conversion fixes

ggml-ci
											
										
										
											2024-02-22 21:21:39 +00:00
+								#include "ggml-cuda.h"
 								#include "ggml.h"
 								#include "ggml-backend-impl.h"
-												ggml : reuse quantum structs across backends (#5943)

* ggml : reuse quant blocks across backends

ggml-ci

* ggml : define helper constants only for CUDA and SYCL

ggml-ci

* ggml : define helper quantum constants for SYCL

ggml-ci
											
										
										
											2024-03-12 12:27:20 +00:00
+								#if defined(GGML_USE_HIPBLAS)
 								#define GGML_COMMON_DECL_HIP
 								#define GGML_COMMON_IMPL_HIP
 								#else
 								#define GGML_COMMON_DECL_CUDA
-												ggml : add ggml-common.h to deduplicate shared code (#5940)

* ggml : add ggml-common.h to shared code

ggml-ci

* scripts : update sync scripts

* sycl : reuse quantum tables

ggml-ci

* ggml : minor

* ggml : minor

* sycl : try to fix build
											
										
										
											2024-03-09 10:47:57 +00:00
+								#define GGML_COMMON_IMPL_CUDA
-												ggml : reuse quantum structs across backends (#5943)

* ggml : reuse quant blocks across backends

ggml-ci

* ggml : define helper constants only for CUDA and SYCL

ggml-ci

* ggml : define helper quantum constants for SYCL

ggml-ci
											
										
										
											2024-03-12 12:27:20 +00:00
+								#endif
-												ggml : add ggml-common.h to deduplicate shared code (#5940)

* ggml : add ggml-common.h to shared code

ggml-ci

* scripts : update sync scripts

* sycl : reuse quantum tables

ggml-ci

* ggml : minor

* ggml : minor

* sycl : try to fix build
											
										
										
											2024-03-09 10:47:57 +00:00
+								#include "ggml-common.h"
-												llama.cpp : split llama_context_params into model and context params (#3301)

* llama.cpp : split llama_context_params into model and context params

ggml-ci

* fix metal build

* fix freq_base/scale default to model value

* llama-bench : keep the same model between tests when possible

* move n_threads to llama_context_params, add n_threads_batch

* fix mpi build

* remove kv_size(), cuda scratch fixes

* remove low-vram option

* add n_threads_batch to system info, refactor to get_system_info()

* add documentation about --threads-batch to the READMEs

* llama-bench fix

* main : fix rope freq/scale warning

* llama.cpp : add llama_get_model
common : add llama_tokenize from model

* remove duplicated ctx/model functions

ggml-ci

* cuda : print total VRAM used
											
										
										
											2023-09-28 19:42:38 +00:00
+								#include <algorithm>
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#include <array>
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
+								#include <assert.h>
 								#include <atomic>
 								#include <cinttypes>
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 16:11:07 +00:00
+								#include <cstddef>
 								#include <cstdint>
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								#include <float.h>
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
+								#include <limits>
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#include <map>
 								#include <memory>
 								#include <mutex>
-												Improve cuBLAS performance by dequantizing on the GPU (#1065)


											
										
										
											2023-04-20 01:14:14 +00:00
+								#include <stdint.h>
-												Improve cuBLAS performance by using a memory pool (#1094)

* Improve cuBLAS performance by using a memory pool

* Move cuda specific definitions to ggml-cuda.h/cu

* Add CXX flags to nvcc

* Change memory pool synchronization mechanism to a spin lock
General code cleanup
											
										
										
											2023-04-21 19:59:17 +00:00
+								#include <stdio.h>
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								#include <string>
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
+								#include <vector>
-												Improve cuBLAS performance by dequantizing on the GPU (#1065)


											
										
										
											2023-04-20 01:14:14 +00:00
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 12:31:56 +00:00
+								// stringize macro for converting __CUDA_ARCH_LIST__ (list of integers) to string
 								#define STRINGIZE_IMPL(...) #__VA_ARGS__
 								#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 09:09:42 +00:00
+								#if defined(GGML_USE_HIPBLAS)
 								#include <hip/hip_runtime.h>
 								#include <hipblas/hipblas.h>
 								#include <hip/hip_fp16.h>
 								#ifdef __HIP_PLATFORM_AMD__
 								// for rocblas_initialize()
 								#include "rocblas/rocblas.h"
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 09:20:24 +00:00
+								#endif // __HIP_PLATFORM_AMD__
-												ggml-cuda : perform cublas fp16 matrix multiplication as fp16 (#3370)

* ggml-cuda : perform cublas fp16 matrix multiplication as fp16

* try to fix rocm build

* restrict fp16 mat mul to volta and up
											
										
										
											2023-09-28 10:08:28 +00:00
+								#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 09:09:42 +00:00
+								#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
 								#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
 								#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
-												ggml-cuda : perform cublas fp16 matrix multiplication as fp16 (#3370)

* ggml-cuda : perform cublas fp16 matrix multiplication as fp16

* try to fix rocm build

* restrict fp16 mat mul to volta and up
											
										
										
											2023-09-28 10:08:28 +00:00
+								#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 09:09:42 +00:00
+								#define CUBLAS_OP_N HIPBLAS_OP_N
 								#define CUBLAS_OP_T HIPBLAS_OP_T
 								#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
 								#define CUBLAS_TF32_TENSOR_OP_MATH 0
 								#define CUDA_R_16F  HIPBLAS_R_16F
 								#define CUDA_R_32F  HIPBLAS_R_32F
 								#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
-												ggml-cuda: Fix HIP build (#4528)

regression of #4490
Adds defines for two new datatypes
cublasComputeType_t, cudaDataType_t.

Currently using deprecated hipblasDatatype_t since newer ones very recent.
											
										
										
											2023-12-18 21:33:45 +00:00
+								#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 09:09:42 +00:00
+								#define cublasCreate hipblasCreate
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#define cublasDestroy hipblasDestroy
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 09:09:42 +00:00
+								#define cublasGemmEx hipblasGemmEx
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 13:48:37 +00:00
+								#define cublasGemmBatchedEx hipblasGemmBatchedEx
 								#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 09:09:42 +00:00
+								#define cublasHandle_t hipblasHandle_t
 								#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
 								#define cublasSetStream hipblasSetStream
 								#define cublasSgemm hipblasSgemm
 								#define cublasStatus_t hipblasStatus_t
-												ggml-cuda: Fix HIP build (#4528)

regression of #4490
Adds defines for two new datatypes
cublasComputeType_t, cudaDataType_t.

Currently using deprecated hipblasDatatype_t since newer ones very recent.
											
										
										
											2023-12-18 21:33:45 +00:00
+								#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
-												CUDA: enable peer access between devices (#2470)


											
										
										
											2023-09-17 14:37:53 +00:00
+								#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
 								#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
 								#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 09:09:42 +00:00
+								#define cudaDeviceProp hipDeviceProp_t
 								#define cudaDeviceSynchronize hipDeviceSynchronize
 								#define cudaError_t hipError_t
-												cuda : ignore peer access already enabled errors (#5597)

* cuda : ignore peer access already enabled errors

* fix hip
											
										
										
											2024-02-19 22:40:26 +00:00
+								#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
 								#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 09:09:42 +00:00
+								#define cudaEventCreateWithFlags hipEventCreateWithFlags
 								#define cudaEventDisableTiming hipEventDisableTiming
 								#define cudaEventRecord hipEventRecord
-												llama : add pipeline parallelism support (#6017)

* llama : add pipeline parallelism support for batch processing with multiple CUDA GPUs

ggml-ci

* server : add -ub, --ubatch-size parameter

* fix server embedding test

* llama : fix Mamba inference for pipeline parallelism

Tested to work correctly with both `main` and `parallel` examples.

* llama : limit max batch size to n_batch

* add LLAMA_SCHED_MAX_COPIES to configure the number of input copies for pipeline parallelism
default increase to 4 (from 2)

changing this value may improve performance for some systems, but increases memory usage

* fix hip build

* fix sycl build (disable cpy_tensor_async)

* fix hip build

* llama : limit n_batch and n_ubatch to n_ctx during context creation

* llama : fix norm backend

* batched-bench : sync after decode

* swiftui : sync after decode

* ggml : allow ggml_get_rows to use multiple threads if they are available

* check n_ubatch >= n_tokens with non-casual attention

* llama : do not limit n_batch to n_ctx with non-casual attn

* server : construct batch with size of llama_n_batch

* ggml_backend_cpu_graph_compute : fix return value when alloc fails

* llama : better n_batch and n_ubatch comment

* fix merge

* small fix

* reduce default n_batch to 2048

---------

Co-authored-by: Francis Couture-Harpin <git@compilade.net>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-03-13 17:54:21 +00:00
+								#define cudaEventSynchronize hipEventSynchronize
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 09:09:42 +00:00
+								#define cudaEvent_t hipEvent_t
 								#define cudaEventDestroy hipEventDestroy
 								#define cudaFree hipFree
 								#define cudaFreeHost hipHostFree
 								#define cudaGetDevice hipGetDevice
 								#define cudaGetDeviceCount hipGetDeviceCount
 								#define cudaGetDeviceProperties hipGetDeviceProperties
 								#define cudaGetErrorString hipGetErrorString
 								#define cudaGetLastError hipGetLastError
-												backend : offload large batches to GPU (#6083)

* backend : offload large batches to GPU

* fix hip

* code cleanup

* fix CUDA split buffers

* Update ggml-backend-impl.h

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix memset without set_device

* imatrix : remove sched affix from weight names

* sched : add a new split if the current one has too many inputs
reduce max inputs per split
more cleanup

* update backends

ggml-ci

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-03-18 10:03:04 +00:00
+								#define cudaHostRegister hipHostRegister
 								#define cudaHostRegisterPortable hipHostRegisterPortable
 								#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
 								#define cudaHostUnregister hipHostUnregister
-												llama : add pipeline parallelism support (#6017)

* llama : add pipeline parallelism support for batch processing with multiple CUDA GPUs

ggml-ci

* server : add -ub, --ubatch-size parameter

* fix server embedding test

* llama : fix Mamba inference for pipeline parallelism

Tested to work correctly with both `main` and `parallel` examples.

* llama : limit max batch size to n_batch

* add LLAMA_SCHED_MAX_COPIES to configure the number of input copies for pipeline parallelism
default increase to 4 (from 2)

changing this value may improve performance for some systems, but increases memory usage

* fix hip build

* fix sycl build (disable cpy_tensor_async)

* fix hip build

* llama : limit n_batch and n_ubatch to n_ctx during context creation

* llama : fix norm backend

* batched-bench : sync after decode

* swiftui : sync after decode

* ggml : allow ggml_get_rows to use multiple threads if they are available

* check n_ubatch >= n_tokens with non-casual attention

* llama : do not limit n_batch to n_ctx with non-casual attn

* server : construct batch with size of llama_n_batch

* ggml_backend_cpu_graph_compute : fix return value when alloc fails

* llama : better n_batch and n_ubatch comment

* fix merge

* small fix

* reduce default n_batch to 2048

---------

Co-authored-by: Francis Couture-Harpin <git@compilade.net>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-03-13 17:54:21 +00:00
+								#define cudaLaunchHostFunc hipLaunchHostFunc
-												cuda : ROCm AMD Unified Memory Architecture (UMA) handling (#4449)

* AMD ROCm: handle UMA memory VRAM expansions

This resolves #2797 by allowing ROCm AMD GPU users with a UMA to
dynamically expand the VRAM allocated to the GPU.

Without this, AMD ROCm users with shared CPU/GPU memory usually are
stuck with the BIOS-set (or fixed) framebuffer VRAM, making it
impossible to load more than 1-2 layers.

Note that the model is duplicated in RAM because it's loaded once for
the CPU and then copied into a second set of allocations that are
managed by the HIP UMA system. We can fix this later.

* clarify build process for ROCm on linux with cmake

* avoid using deprecated ROCm hipMallocHost

* keep simplifying the change required for UMA

* cmake: enable UMA-compatible allocation when LLAMA_HIP_UMA=ON
											
										
										
											2023-12-21 19:45:32 +00:00
+								#ifdef GGML_HIP_UMA
 								#define cudaMalloc hipMallocManaged
 								#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
 								#else
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 09:09:42 +00:00
+								#define cudaMalloc hipMalloc
 								#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
-												cuda : ROCm AMD Unified Memory Architecture (UMA) handling (#4449)

* AMD ROCm: handle UMA memory VRAM expansions

This resolves #2797 by allowing ROCm AMD GPU users with a UMA to
dynamically expand the VRAM allocated to the GPU.

Without this, AMD ROCm users with shared CPU/GPU memory usually are
stuck with the BIOS-set (or fixed) framebuffer VRAM, making it
impossible to load more than 1-2 layers.

Note that the model is duplicated in RAM because it's loaded once for
the CPU and then copied into a second set of allocations that are
managed by the HIP UMA system. We can fix this later.

* clarify build process for ROCm on linux with cmake

* avoid using deprecated ROCm hipMallocHost

* keep simplifying the change required for UMA

* cmake: enable UMA-compatible allocation when LLAMA_HIP_UMA=ON
											
										
										
											2023-12-21 19:45:32 +00:00
+								#endif
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 09:09:42 +00:00
+								#define cudaMemcpy hipMemcpy
 								#define cudaMemcpyAsync hipMemcpyAsync
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 20:23:59 +00:00
+								#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
 								#define cudaMemcpy2DAsync hipMemcpy2DAsync
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 09:09:42 +00:00
+								#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
 								#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
 								#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
 								#define cudaMemcpyKind hipMemcpyKind
 								#define cudaMemset hipMemset
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
+								#define cudaMemsetAsync hipMemsetAsync
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								#define cudaMemGetInfo hipMemGetInfo
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 09:09:42 +00:00
+								#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
 								#define cudaSetDevice hipSetDevice
 								#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#define cudaStreamDestroy hipStreamDestroy
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								#define cudaStreamFireAndForget hipStreamFireAndForget
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 09:09:42 +00:00
+								#define cudaStreamNonBlocking hipStreamNonBlocking
-												llama : add pipeline parallelism support (#6017)

* llama : add pipeline parallelism support for batch processing with multiple CUDA GPUs

ggml-ci

* server : add -ub, --ubatch-size parameter

* fix server embedding test

* llama : fix Mamba inference for pipeline parallelism

Tested to work correctly with both `main` and `parallel` examples.

* llama : limit max batch size to n_batch

* add LLAMA_SCHED_MAX_COPIES to configure the number of input copies for pipeline parallelism
default increase to 4 (from 2)

changing this value may improve performance for some systems, but increases memory usage

* fix hip build

* fix sycl build (disable cpy_tensor_async)

* fix hip build

* llama : limit n_batch and n_ubatch to n_ctx during context creation

* llama : fix norm backend

* batched-bench : sync after decode

* swiftui : sync after decode

* ggml : allow ggml_get_rows to use multiple threads if they are available

* check n_ubatch >= n_tokens with non-casual attention

* llama : do not limit n_batch to n_ctx with non-casual attn

* server : construct batch with size of llama_n_batch

* ggml_backend_cpu_graph_compute : fix return value when alloc fails

* llama : better n_batch and n_ubatch comment

* fix merge

* small fix

* reduce default n_batch to 2048

---------

Co-authored-by: Francis Couture-Harpin <git@compilade.net>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-03-13 17:54:21 +00:00
+								#define cudaStreamPerThread hipStreamPerThread
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 09:09:42 +00:00
+								#define cudaStreamSynchronize hipStreamSynchronize
-												Enable build with CUDA 11.0 (make) (#3132)

* CUDA 11.0 fixes

* Cleaner CUDA/host flags separation

Also renamed GGML_ASSUME into GGML_CUDA_ASSUME
											
										
										
											2023-09-16 14:55:43 +00:00
+								#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 09:09:42 +00:00
+								#define cudaStream_t hipStream_t
 								#define cudaSuccess hipSuccess
-												ggml-cuda: Fix HIP build by adding define for __trap (#4569)

Regression of 139882392258671ffe5acdfcadc0bc08572d6eef
HIP doesn't have trap, only abort
											
										
										
											2023-12-21 19:13:25 +00:00
+								#define __trap abort
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
+								#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
 								#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
 								#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
 								#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
 								#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
 								#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
 								#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
 								#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
 								#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 09:09:42 +00:00
+								#else
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 16:11:07 +00:00
+								#include <cuda_runtime.h>
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
+								#include <cuda.h>
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 16:11:07 +00:00
+								#include <cublas_v2.h>
 								#include <cuda_fp16.h>
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
 								#if CUDART_VERSION < 11020
-												Fix new CUDA10 compilation errors (#4635)


											
										
										
											2023-12-26 10:38:36 +00:00
+								#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
-												cuda : fix jetson compile error (#4560)

* fix old jetson compile error

* Update Makefile

* update jetson detect and cuda version detect

* update cuda marco define

* update makefile and cuda,fix some issue

* Update README.md

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update Makefile

* Update README.md

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-12-22 15:11:12 +00:00
+								#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
 								#define CUBLAS_COMPUTE_16F CUDA_R_16F
 								#define CUBLAS_COMPUTE_32F CUDA_R_32F
 								#define cublasComputeType_t cudaDataType_t
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
+								#endif // CUDART_VERSION < 11020
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 09:20:24 +00:00
+								#endif // defined(GGML_USE_HIPBLAS)
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 16:11:07 +00:00
-												CUDA: fix softmax compile for old CUDA versions (#4862)


											
										
										
											2024-01-12 11:30:41 +00:00
+								#define CUDART_HMAX     11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 07:58:55 +00:00
+								#define CC_PASCAL     600
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 09:20:24 +00:00
+								#define MIN_CC_DP4A   610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 16:12:57 +00:00
+								#define CC_VOLTA      700
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 09:20:24 +00:00
+								#define CC_OFFSET_AMD 1000000
-												CUDA: fixed tensor cores not being used on RDNA3 (#4697)


											
										
										
											2023-12-30 12:52:01 +00:00
+								#define CC_RDNA1      (CC_OFFSET_AMD + 1010)
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 16:12:57 +00:00
+								#define CC_RDNA2      (CC_OFFSET_AMD + 1030)
-												CUDA: fixed tensor cores not being used on RDNA3 (#4697)


											
										
										
											2023-12-30 12:52:01 +00:00
+								#define CC_RDNA3      (CC_OFFSET_AMD + 1100)
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 09:09:42 +00:00
-												CUDA: fixed tensor cores not being used on RDNA3 (#4697)


											
										
										
											2023-12-30 12:52:01 +00:00
+								// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
 								// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
 								// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
 								// -  7B quantum model: +100-200 MB
 								// - 13B quantum model: +200-400 MB
 								//
 								//#define GGML_CUDA_FORCE_MMQ
 								// TODO: improve this to be correct for more hardware
 								//       for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
 								#if !defined(GGML_CUDA_FORCE_MMQ)
 								#define CUDA_USE_TENSOR_CORES
 								#endif
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 18:08:39 +00:00
+								#define MMVQ_MAX_BATCH_SIZE  8 // max batch size to use MMVQ kernels
 								#define  MMQ_MAX_BATCH_SIZE 32 // max batch size to use MMQ kernels when tensor cores are available
-												CUDA: fixed tensor cores not being used on RDNA3 (#4697)


											
										
										
											2023-12-30 12:52:01 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
-												IQ3_S: a much better alternative to Q3_K (#5676)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* Resurrecting iq3_xs

After all the experimentation, nothing was better than this.

* Minor PPL improvement via a block scale fudge factor

* Minor improvement via 3 neighbours

* iq3_xs: working scalar and AVX2 dot products

* iq3_xs: ARM_NEON dot product - works but extremely slow (10 t/s)

* iq3_xs: working Metal implementation

* Adding IQ3_M - IQ3_XS mix with mostly Q4_K

* iiq3_xs: a 3.4375 bpw variant

* iq3_xs: make CUDA work for new version

* iq3_xs: make scalar and AVX2 work for new version

* iq3_s: make ARM_NEON work with new version

* iq3_xs: make new version work on metal

Performance is very similar to Q3_K_S

* iq3_xs: tiny Metal speed improvement

* iq3_xs: tiny Metal speed improvement

* Fix stupid warning

* Q3_K_XS now uses a mix of IQ3_XS and IQ3_XXS

* iq3_xs: rename to iq3_s

* iq3_s: make tests pass

* Move Q3_K_XS mix to 3.25 bpw

* Attempt to fix failing tests

* Another attempt to fix the Windows builds

* Attempt to fix ROCm

* ROCm again

* iq3_s: partial fix for QK_K = 64

* iq3_s: make it work on metal for QK_K = 64

Pleasent surprise: the coding was super-block size independent,
so all it took was to delete some QK_K == 256 guards.

* Will this fix ROCm?

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-24 14:23:52 +00:00
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 17:44:08 +00:00
-												ggml : fix warnings under MSVC (#1908)


											
										
										
											2023-06-17 15:46:15 +00:00
+								#if defined(_MSC_VER)
 								#pragma warning(disable: 4244 4267) // possible loss of data
 								#endif
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 16:11:07 +00:00
+								static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 20:23:59 +00:00
+								[[noreturn]]
 								static void ggml_cuda_error(const char * stmt, const char * func, const char * file, const int line, const char * msg) {
 								    int id = -1; // in case cudaGetDevice fails
 								    cudaGetDevice(&id);
 								    fprintf(stderr, "CUDA error: %s\n", msg);
 								    fprintf(stderr, "  current device: %d, in function %s at %s:%d\n", id, func, file, line);
 								    fprintf(stderr, "  %s\n", stmt);
 								    // abort with GGML_ASSERT to get a stack trace
 								    GGML_ASSERT(!"CUDA error");
 								}
 								#define CUDA_CHECK_GEN(err, success, error_fn)                                      \
 								     do {                                                                           \
 								        auto err_ = (err);                                                          \
 								        if (err_ != (success)) {                                                    \
 								            ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_));    \
 								        }                                                                           \
 								    } while (0)
 								#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)
-												Fixed CUDA runtime version check (#1879)


											
										
										
											2023-06-15 19:49:08 +00:00
+								#if CUDART_VERSION >= 12000
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
+								    static const char * cublas_get_error_str(const cublasStatus_t err) {
 								        return cublasGetStatusString(err);
 								    }
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
+								#else
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
+								    static const char * cublas_get_error_str(const cublasStatus_t err) {
 								        switch (err) {
 								            case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
 								            case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
 								            case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
 								            case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
 								            case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
 								            case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
 								            case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
 								            case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
 								            case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
 								            default: return "unknown error";
 								        }
 								    }
 								#endif // CUDART_VERSION >= 12000
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 20:23:59 +00:00
+								#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
 								#if !defined(GGML_USE_HIPBLAS)
 								static const char * cu_get_error_str(CUresult err) {
 								    const char * err_str;
 								    cuGetErrorString(err, &err_str);
 								    return err_str;
 								}
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 20:23:59 +00:00
+								#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
+								#endif
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 16:11:07 +00:00
-												Enable build with CUDA 11.0 (make) (#3132)

* CUDA 11.0 fixes

* Cleaner CUDA/host flags separation

Also renamed GGML_ASSUME into GGML_CUDA_ASSUME
											
										
										
											2023-09-16 14:55:43 +00:00
+								#if CUDART_VERSION >= 11100
 								#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
 								#else
 								#define GGML_CUDA_ASSUME(x)
 								#endif // CUDART_VERSION >= 11100
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#define GGML_CUDA_MAX_STREAMS 8
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								struct ggml_tensor_extra_gpu {
 								    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
 								    cudaEvent_t events[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS]; // events for synchronizing multiple GPUs
 								};
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								// this is faster on Windows
 								// probably because the Windows CUDA libraries forget to make this check before invoking the drivers
 								static void ggml_cuda_set_device(const int device) {
 								    int current_device;
 								    CUDA_CHECK(cudaGetDevice(&current_device));
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (device == current_device) {
 								        return;
 								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    CUDA_CHECK(cudaSetDevice(device));
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static int ggml_cuda_get_device() {
 								    int id;
 								    CUDA_CHECK(cudaGetDevice(&id));
 								    return id;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								struct ggml_cuda_device_info {
 								    int device_count;
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 08:23:56 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    struct cuda_device_info {
 								        int     cc;                 // compute capability
 								        size_t  smpb;               // max. shared memory per block
 								        bool    vmm;                // virtual memory support
 								        size_t  vmm_granularity;    // granularity of virtual memory
 								        size_t  total_vram;
 								    };
-												ggml-cuda : perform cublas fp16 matrix multiplication as fp16 (#3370)

* ggml-cuda : perform cublas fp16 matrix multiplication as fp16

* try to fix rocm build

* restrict fp16 mat mul to volta and up
											
										
										
											2023-09-28 10:08:28 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {};
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 13:38:36 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};
 								};
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static ggml_cuda_device_info ggml_cuda_init() {
 								#ifdef __HIP_PLATFORM_AMD__
 								    // Workaround for a rocBLAS bug when using multiple graphics cards:
 								    // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
 								    rocblas_initialize();
 								    CUDA_CHECK(cudaDeviceSynchronize());
 								#endif
-												cuda : performance optimizations (#1530)

* xor hack

* block y dim

* loop unrolling

* Fixed cmake LLAMA_CUDA_BY option

* Removed hipblas compatibility code

* Define GGML_CUDA_DMMV_BLOCK_Y if not defined

* Fewer iters, more ops per iter

* Renamed DMMV X/Y compilation options
											
										
										
											2023-05-25 21:07:29 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ggml_cuda_device_info info = {};
-												cuda : performance optimizations (#1530)

* xor hack

* block y dim

* loop unrolling

* Fixed cmake LLAMA_CUDA_BY option

* Removed hipblas compatibility code

* Define GGML_CUDA_DMMV_BLOCK_Y if not defined

* Fewer iters, more ops per iter

* Renamed DMMV X/Y compilation options
											
										
										
											2023-05-25 21:07:29 +00:00
-												cuda : print the returned error when CUDA initialization fails (#6185)


											
										
										
											2024-03-20 20:03:26 +00:00
+								    cudaError_t err = cudaGetDeviceCount(&info.device_count);
 								    if (err != cudaSuccess) {
 								        fprintf(stderr, "%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        return info;
 								    }
-												CUDA: faster q8_0 -> f16 dequantization (#4895)


											
										
										
											2024-01-12 19:38:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES);
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 13:38:36 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int64_t total_vram = 0;
 								#if defined(GGML_CUDA_FORCE_MMQ)
 								    fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ:   yes\n", __func__);
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
+								#else
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ:   no\n", __func__);
 								#endif
 								#if defined(CUDA_USE_TENSOR_CORES)
 								    fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
 								#else
 								    fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
+								#endif
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
 								    for (int id = 0; id < info.device_count; ++id) {
 								        int device_vmm = 0;
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if !defined(GGML_USE_HIPBLAS)
 								        CUdevice device;
 								        CU_CHECK(cuDeviceGet(&device, id));
 								        CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
-												CUDA: enable peer access between devices (#2470)


											
										
										
											2023-09-17 14:37:53 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        if (device_vmm) {
 								            CUmemAllocationProp alloc_prop = {};
 								            alloc_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
 								            alloc_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
 								            alloc_prop.location.id = id;
 								            CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
 								        }
 								#endif // !defined(GGML_USE_HIPBLAS)
 								        info.devices[id].vmm = !!device_vmm;
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 17:55:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        cudaDeviceProp prop;
 								        CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
 								        fprintf(stderr, "  Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 17:55:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        info.default_tensor_split[id] = total_vram;
 								        total_vram += prop.totalGlobalMem;
-												Better CUDA synchronization logic (#2057)


											
										
										
											2023-07-01 19:49:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								        info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
 								#else
 								        info.devices[id].cc = 100*prop.major + 10*prop.minor;
 								#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								        info.devices[id].smpb = prop.sharedMemPerBlock;
 								    }
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 17:55:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int id = 0; id < info.device_count; ++id) {
 								        info.default_tensor_split[id] /= total_vram;
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 17:55:51 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // configure logging to stdout
 								    // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
 								    return info;
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 17:55:51 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static const ggml_cuda_device_info & get_cuda_global_info() {
 								    static ggml_cuda_device_info info = ggml_cuda_init();
 								    return info;
 								}
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 07:42:34 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								// #define DEBUG_CUDA_MALLOC
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								// buffer pool for cuda (legacy)
 								struct ggml_cuda_pool {
 								    virtual ~ggml_cuda_pool() = default;
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    virtual void * alloc(size_t size, size_t * actual_size) = 0;
 								    virtual void free(void * ptr, size_t size) = 0;
 								};
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 12:31:56 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								struct ggml_cuda_pool_leg : public ggml_cuda_pool {
 								    static const int MAX_BUFFERS = 256;
-												cuda : replace asserts in wrong architecture checks with __trap (#4556)

* cuda : replace asserts in wrong architecture checks with __trap

* make bad_arch noreturn, remove returns
											
										
										
											2023-12-21 17:02:30 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int device;
 								    struct ggml_cuda_buffer {
 								        void * ptr = nullptr;
 								        size_t size = 0;
 								    };
-												cuda : replace asserts in wrong architecture checks with __trap (#4556)

* cuda : replace asserts in wrong architecture checks with __trap

* make bad_arch noreturn, remove returns
											
										
										
											2023-12-21 17:02:30 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ggml_cuda_buffer buffer_pool[MAX_BUFFERS] = {};
 								    size_t pool_size = 0;
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 12:31:56 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    explicit ggml_cuda_pool_leg(int device) :
 								        device(device) {
-												ggml : add ggml_soft_max_ext (#4256)

* metal : implement soft_max_ext

* cuda : implement soft_max_ext

* ggml : implement soft_max_ext (CPU)

* batched-bench : print threads

ggml-ci

* metal : simplify soft_max encoding

ggml-ci

* cuda : use 512 threads for soft_max instead of 32

* ggml : update soft max cpu

* cuda : do warp-based block reduce

* cuda : increase max block size to 1024

* cuda : fix warp reduction initialization of shared mem

* metal : warp-based reduction for soft max kernel

* metal : warp-based reduce for rms_norm

* metal : simplify soft max kernel

ggml-ci

* alloc : fix build with debug
											
										
										
											2023-12-01 08:51:24 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ~ggml_cuda_pool_leg() {
 								        ggml_cuda_set_device(device);
 								        for (int i = 0; i < MAX_BUFFERS; ++i) {
 								            ggml_cuda_buffer & b = buffer_pool[i];
 								            if (b.ptr != nullptr) {
 								                CUDA_CHECK(cudaFree(b.ptr));
 								                pool_size -= b.size;
 								            }
 								        }
 								        GGML_ASSERT(pool_size == 0);
-												ggml : add ggml_soft_max_ext (#4256)

* metal : implement soft_max_ext

* cuda : implement soft_max_ext

* ggml : implement soft_max_ext (CPU)

* batched-bench : print threads

ggml-ci

* metal : simplify soft_max encoding

ggml-ci

* cuda : use 512 threads for soft_max instead of 32

* ggml : update soft max cpu

* cuda : do warp-based block reduce

* cuda : increase max block size to 1024

* cuda : fix warp reduction initialization of shared mem

* metal : warp-based reduction for soft max kernel

* metal : warp-based reduce for rms_norm

* metal : simplify soft max kernel

ggml-ci

* alloc : fix build with debug
											
										
										
											2023-12-01 08:51:24 +00:00
+								    }
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 07:58:55 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    void * alloc(size_t size, size_t * actual_size) override {
 								#ifdef DEBUG_CUDA_MALLOC
 								        int nnz = 0;
 								        size_t max_size = 0;
 								#endif
 								        size_t best_diff = 1ull << 36;
 								        int ibest = -1;
 								        for (int i = 0; i < MAX_BUFFERS; ++i) {
 								            ggml_cuda_buffer& b = buffer_pool[i];
 								            if (b.ptr != nullptr) {
 								#ifdef DEBUG_CUDA_MALLOC
 								                ++nnz;
 								                if (b.size > max_size) max_size = b.size;
 								#endif
 								                if (b.size >= size) {
 								                    size_t diff = b.size - size;
 								                    if (diff < best_diff) {
 								                        best_diff = diff;
 								                        ibest = i;
 								                        if (!best_diff) {
 								                            void * ptr = b.ptr;
 								                            *actual_size = b.size;
 								                            b.ptr = nullptr;
 								                            b.size = 0;
 								                            return ptr;
 								                        }
 								                    }
 								                }
 								            }
 								        }
 								        if (ibest >= 0) {
 								            ggml_cuda_buffer& b = buffer_pool[ibest];
 								            void * ptr = b.ptr;
 								            *actual_size = b.size;
 								            b.ptr = nullptr;
 								            b.size = 0;
 								            return ptr;
 								        }
 								        void * ptr;
 								        size_t look_ahead_size = (size_t) (1.05 * size);
 								        look_ahead_size = 256 * ((look_ahead_size + 255)/256);
 								        ggml_cuda_set_device(device);
 								        CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
 								        *actual_size = look_ahead_size;
 								        pool_size += look_ahead_size;
 								#ifdef DEBUG_CUDA_MALLOC
 								        fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
 								                (uint32_t)(max_size/1024/1024), (uint32_t)(pool_size/1024/1024), (uint32_t)(size/1024/1024));
 								#endif
 								        return ptr;
-												ggml : add ggml_soft_max_ext (#4256)

* metal : implement soft_max_ext

* cuda : implement soft_max_ext

* ggml : implement soft_max_ext (CPU)

* batched-bench : print threads

ggml-ci

* metal : simplify soft_max encoding

ggml-ci

* cuda : use 512 threads for soft_max instead of 32

* ggml : update soft max cpu

* cuda : do warp-based block reduce

* cuda : increase max block size to 1024

* cuda : fix warp reduction initialization of shared mem

* metal : warp-based reduction for soft max kernel

* metal : warp-based reduce for rms_norm

* metal : simplify soft max kernel

ggml-ci

* alloc : fix build with debug
											
										
										
											2023-12-01 08:51:24 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    void free(void * ptr, size_t size) override {
 								        for (int i = 0; i < MAX_BUFFERS; ++i) {
 								            ggml_cuda_buffer& b = buffer_pool[i];
 								            if (b.ptr == nullptr) {
 								                b.ptr = ptr;
 								                b.size = size;
 								                return;
 								            }
 								        }
 								        fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
 								        ggml_cuda_set_device(device);
 								        CUDA_CHECK(cudaFree(ptr));
 								        pool_size -= size;
 								    }
 								};
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 07:58:55 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								// pool with virtual memory
 								#if !defined(GGML_USE_HIPBLAS)
 								struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
 								    static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int device;
 								    CUdeviceptr pool_addr = 0;
 								    size_t pool_used = 0;
 								    size_t pool_size = 0;
 								    size_t granularity;
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    explicit ggml_cuda_pool_vmm(int device) :
 								        device(device),
 								        granularity(get_cuda_global_info().devices[device].vmm_granularity) {
 								    }
-												CUDA GPU acceleration for LoRAs + f16 models (#1970)


											
										
										
											2023-06-28 16:35:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ~ggml_cuda_pool_vmm() {
 								        if (pool_addr != 0) {
 								            CU_CHECK(cuMemUnmap(pool_addr, pool_size));
 								            CU_CHECK(cuMemAddressFree(pool_addr, CUDA_POOL_VMM_MAX_SIZE));
 								        }
 								    }
-												CUDA GPU acceleration for LoRAs + f16 models (#1970)


											
										
										
											2023-06-28 16:35:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    void * alloc(size_t size, size_t * actual_size) override {
 								        // round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
 								        const size_t alignment = 128;
 								        size = alignment * ((size + alignment - 1) / alignment);
-												finetune : add -ngl parameter (#3762)

* Add '-ngl' support to finetune.cpp

* Add fprintf in ggml_cuda_op_add

When I tried CUDA offloading during finetuning following the readme, I got an assert here.
This probably isn't an important case because inference later gives a warning saying you should use f16 or f32 instead when using lora

* Add 'finetune.sh', which currently fails when using GPU

"error: operator (): Finetuning on tensors with type 'f16' is not yet supported"

* tweak finetune.sh

* Suppress some warnings in ggml.c

* Add f16 implementation to ggml_compute_forward_add_f16_f32

* Add an f16 case to ggml_add_cast_impl and llama_build_lora_finetune_graphs

* finetune.sh: Edit comments

* Add "add_f16_f32_f32_cuda"

* Tweak an error message

* finetune.sh: Add an optional LLAMA_MODEL_DIR variable

* finetune.sh: Add an optional LLAMA_TRAINING_DIR variable

* train : minor

* tabs to spaces

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
											
										
										
											2023-11-01 11:49:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        size_t avail = pool_size - pool_used;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        if (size > avail) {
 								            // round up to the next multiple of the granularity
 								            size_t reserve_size = size - avail;
 								            reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            GGML_ASSERT(pool_size + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            // allocate more physical memory
 								            CUmemAllocationProp prop = {};
 								            prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
 								            prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
 								            prop.location.id = device;
 								            CUmemGenericAllocationHandle handle;
 								            CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            // reserve virtual address space (if not already reserved)
 								            if (pool_addr == 0) {
 								                CU_CHECK(cuMemAddressReserve(&pool_addr, CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
 								            }
-												finetune : add -ngl parameter (#3762)

* Add '-ngl' support to finetune.cpp

* Add fprintf in ggml_cuda_op_add

When I tried CUDA offloading during finetuning following the readme, I got an assert here.
This probably isn't an important case because inference later gives a warning saying you should use f16 or f32 instead when using lora

* Add 'finetune.sh', which currently fails when using GPU

"error: operator (): Finetuning on tensors with type 'f16' is not yet supported"

* tweak finetune.sh

* Suppress some warnings in ggml.c

* Add f16 implementation to ggml_compute_forward_add_f16_f32

* Add an f16 case to ggml_add_cast_impl and llama_build_lora_finetune_graphs

* finetune.sh: Edit comments

* Add "add_f16_f32_f32_cuda"

* Tweak an error message

* finetune.sh: Add an optional LLAMA_MODEL_DIR variable

* finetune.sh: Add an optional LLAMA_TRAINING_DIR variable

* train : minor

* tabs to spaces

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
											
										
										
											2023-11-01 11:49:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            // map at the end of the pool
 								            CU_CHECK(cuMemMap(pool_addr + pool_size, reserve_size, 0, handle, 0));
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            // the memory allocation handle is no longer needed after mapping
 								            CU_CHECK(cuMemRelease(handle));
-												cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)

* Broadcasting for ggml_mul

* CUDA kernel for ggml_mul, norms in VRAM

* GPU weights not in RAM, direct loading with cuFile

* fixup! GPU weights not in RAM, direct loading with cuFile

* fixup! GPU weights not in RAM, direct loading with cuFile

* define default model path once, sync path with readme (#1366)

* ~7% faster Q5_1 AVX2 code (#1477)

* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)

* Support models in a single pytorch_model.bin

* Remove spurious line with typo

* benchmark-matmul: Print the average of the test results (#1490)

* Remove unused n_parts parameter (#1509)

* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)

* Fix for w64devkit and mingw

* make kv_f16 the default for api users (#1517)

* minor : fix compile warnings

* readme : adds WizardLM to the list of supported models (#1485)

* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)

* Make reverse prompt option act as a stop token in non-interactive scenarios

* Making requested review changes

* Update gpt_params_parse and fix a merge error

* Revert "Update gpt_params_parse and fix a merge error"

This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.

* Update gpt_params_parse and fix a merge error take 2

* examples : add persistent chat (#1495)

* examples : add persistent chat

* examples : fix whitespace

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* tests : add missing header

* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)

* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0

* llama : bump LLAMA_FILE_VERSION to 3

* cuda : update Q4 and Q8 dequantize kernels

* ggml : fix AVX dot products

* readme : update performance table + hot topics

* ggml : fix scalar implementation of Q4_1 dot

* llama : fix compile warnings in llama_set_state_data()

* llama : fix name shadowing and C4146 (#1526)

* Fix name shadowing and C4146

* Fix if macros not using defined when required

* Update llama-util.h

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Update llama-util.h

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Code style

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Fix for mingw (#1462)

* llama : add llama_init_backend() API (close #1527)

* feature : add blis and other BLAS implementation support (#1502)

* feature: add blis support

* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927

* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake

* Fix typo in INTEGER

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Revert "feature : add blis and other BLAS implementation support (#1502)"

This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.

* GPU weights not in RAM, direct loading with cuFile

* llama : code style fixes + progress print fix

* ggml : ggml_mul better broadcast support

* cmake : workarounds for cufile when CMake version < 3.25

* gg rebase fixup

* Loop in llama.cpp, fixed progress callback

* Attempt clang-tidy fix

* llama : fix vram size computation

* Add forgotten fclose()

---------

Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
											
										
										
											2023-05-20 12:19:28 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            // set access
 								            CUmemAccessDesc access = {};
 								            access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
 								            access.location.id = device;
 								            access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
 								            CU_CHECK(cuMemSetAccess(pool_addr + pool_size, reserve_size, &access, 1));
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            // add to the pool
 								            pool_size += reserve_size;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            //printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
 								            //       id, (unsigned long long) (pool_size[id]/1024/1024),
 								            //       (unsigned long long) (reserve_size/1024/1024));
 								        }
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        GGML_ASSERT(pool_addr != 0);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        void * ptr = (void *) (pool_addr + pool_used);
 								        *actual_size = size;
 								        pool_used += size;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#ifdef DEBUG_CUDA_MALLOC
 								        printf("cuda pool[%d]: allocated %llu bytes at %llx\n", device, (unsigned long long) size, ptr);
 								#endif
-												cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)

* Broadcasting for ggml_mul

* CUDA kernel for ggml_mul, norms in VRAM

* GPU weights not in RAM, direct loading with cuFile

* fixup! GPU weights not in RAM, direct loading with cuFile

* fixup! GPU weights not in RAM, direct loading with cuFile

* define default model path once, sync path with readme (#1366)

* ~7% faster Q5_1 AVX2 code (#1477)

* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)

* Support models in a single pytorch_model.bin

* Remove spurious line with typo

* benchmark-matmul: Print the average of the test results (#1490)

* Remove unused n_parts parameter (#1509)

* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)

* Fix for w64devkit and mingw

* make kv_f16 the default for api users (#1517)

* minor : fix compile warnings

* readme : adds WizardLM to the list of supported models (#1485)

* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)

* Make reverse prompt option act as a stop token in non-interactive scenarios

* Making requested review changes

* Update gpt_params_parse and fix a merge error

* Revert "Update gpt_params_parse and fix a merge error"

This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.

* Update gpt_params_parse and fix a merge error take 2

* examples : add persistent chat (#1495)

* examples : add persistent chat

* examples : fix whitespace

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* tests : add missing header

* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)

* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0

* llama : bump LLAMA_FILE_VERSION to 3

* cuda : update Q4 and Q8 dequantize kernels

* ggml : fix AVX dot products

* readme : update performance table + hot topics

* ggml : fix scalar implementation of Q4_1 dot

* llama : fix compile warnings in llama_set_state_data()

* llama : fix name shadowing and C4146 (#1526)

* Fix name shadowing and C4146

* Fix if macros not using defined when required

* Update llama-util.h

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Update llama-util.h

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Code style

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Fix for mingw (#1462)

* llama : add llama_init_backend() API (close #1527)

* feature : add blis and other BLAS implementation support (#1502)

* feature: add blis support

* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927

* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake

* Fix typo in INTEGER

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Revert "feature : add blis and other BLAS implementation support (#1502)"

This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.

* GPU weights not in RAM, direct loading with cuFile

* llama : code style fixes + progress print fix

* ggml : ggml_mul better broadcast support

* cmake : workarounds for cufile when CMake version < 3.25

* gg rebase fixup

* Loop in llama.cpp, fixed progress callback

* Attempt clang-tidy fix

* llama : fix vram size computation

* Add forgotten fclose()

---------

Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
											
										
										
											2023-05-20 12:19:28 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        return ptr;
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    void free(void * ptr, size_t size) override {
 								#ifdef DEBUG_CUDA_MALLOC
 								        printf("cuda pool[%d]: freed %llu bytes at %llx\n", device, (unsigned long long) size, ptr);
 								#endif
-												cuda : add gelu support

											
										
										
											2023-07-12 17:26:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        pool_used -= size;
 								        // all deallocations must be in reverse order of the allocations
 								        GGML_ASSERT(ptr == (void *) (pool_addr + pool_used));
-												cuda : add gelu support

											
										
										
											2023-07-12 17:26:18 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								};
 								#endif // !defined(GGML_USE_HIPBLAS)
-												cuda : add gelu support

											
										
										
											2023-07-12 17:26:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<typename T>
 								struct ggml_cuda_pool_alloc {
 								    ggml_cuda_pool * pool = nullptr;
 								    T * ptr = nullptr;
 								    size_t actual_size = 0;
-												cuda : add gelu support

											
										
										
											2023-07-12 17:26:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ggml_cuda_pool_alloc() = default;
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    explicit ggml_cuda_pool_alloc(ggml_cuda_pool & pool) : pool(&pool) {
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ggml_cuda_pool_alloc(ggml_cuda_pool & pool, size_t size) : pool(&pool) {
 								        alloc(size);
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ~ggml_cuda_pool_alloc() {
 								        if (ptr != nullptr) {
 								            pool->free(ptr, actual_size);
 								        }
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
+								    }
-												Add ReLU and SQR CUDA ops to (partially) fix Persimmon offloading (#4041)

* Add ReLU and SQR CUDA ops to fix Persimmon offloading

* Persimmon loader: More helpful error on CUDA/ROCM when offloading too many layers
											
										
										
											2023-11-13 08:58:15 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // size is in number of elements
 								    T * alloc(size_t size) {
 								        GGML_ASSERT(pool != nullptr);
 								        GGML_ASSERT(ptr == nullptr);
 								        ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
 								        return ptr;
-												Add ReLU and SQR CUDA ops to (partially) fix Persimmon offloading (#4041)

* Add ReLU and SQR CUDA ops to fix Persimmon offloading

* Persimmon loader: More helpful error on CUDA/ROCM when offloading too many layers
											
										
										
											2023-11-13 08:58:15 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    T * alloc(ggml_cuda_pool & pool, size_t size) {
 								        this->pool = &pool;
 								        return alloc(size);
 								    }
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 13:10:15 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    T * get() {
 								        return ptr;
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 13:10:15 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ggml_cuda_pool_alloc(const ggml_cuda_pool_alloc &) = delete;
 								    ggml_cuda_pool_alloc(ggml_cuda_pool_alloc &&) = delete;
 								    ggml_cuda_pool_alloc& operator=(const ggml_cuda_pool_alloc &) = delete;
 								    ggml_cuda_pool_alloc& operator=(ggml_cuda_pool_alloc &&) = delete;
 								};
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 13:10:15 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								// backend interface
 								struct ggml_backend_cuda_context {
 								    int device;
 								    std::string name;
 								    cudaEvent_t copy_event = nullptr;
 								    cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } };
 								    cublasHandle_t cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 								    explicit ggml_backend_cuda_context(int device) :
 								        device(device),
 								        name(GGML_CUDA_NAME + std::to_string(device)) {
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ~ggml_backend_cuda_context() {
 								        if (copy_event != nullptr) {
 								            CUDA_CHECK(cudaEventDestroy(copy_event));
 								        }
 								        for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) {
 								            for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) {
 								                if (streams[i][j] != nullptr) {
 								                    CUDA_CHECK(cudaStreamDestroy(streams[i][j]));
 								                }
 								            }
 								            if (cublas_handles[i] != nullptr) {
 								                CUBLAS_CHECK(cublasDestroy(cublas_handles[i]));
 								            }
 								        }
 								    }
-												Add ReLU and SQR CUDA ops to (partially) fix Persimmon offloading (#4041)

* Add ReLU and SQR CUDA ops to fix Persimmon offloading

* Persimmon loader: More helpful error on CUDA/ROCM when offloading too many layers
											
										
										
											2023-11-13 08:58:15 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    cudaStream_t stream(int device, int stream) {
 								        if (streams[device][stream] == nullptr) {
 								            ggml_cuda_set_device(device);
 								            CUDA_CHECK(cudaStreamCreateWithFlags(&streams[device][stream], cudaStreamNonBlocking));
 								        }
 								        return streams[device][stream];
-												Add ReLU and SQR CUDA ops to (partially) fix Persimmon offloading (#4041)

* Add ReLU and SQR CUDA ops to fix Persimmon offloading

* Persimmon loader: More helpful error on CUDA/ROCM when offloading too many layers
											
										
										
											2023-11-13 08:58:15 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    cudaStream_t stream() {
 								        return stream(device, 0);
 								    }
-												ggml : sync (abort callback, mul / add broadcast, fix alibi) (#2183)


											
										
										
											2023-07-11 19:53:34 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    cublasHandle_t cublas_handle(int device) {
 								        if (cublas_handles[device] == nullptr) {
 								            ggml_cuda_set_device(device);
 								            CUBLAS_CHECK(cublasCreate(&cublas_handles[device]));
 								            CUBLAS_CHECK(cublasSetMathMode(cublas_handles[device], CUBLAS_TF32_TENSOR_OP_MATH));
 								        }
 								        return cublas_handles[device];
 								    }
-												ggml : sync (abort callback, mul / add broadcast, fix alibi) (#2183)


											
										
										
											2023-07-11 19:53:34 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    cublasHandle_t cublas_handle() {
 								        return cublas_handle(device);
-												ggml : sync (abort callback, mul / add broadcast, fix alibi) (#2183)


											
										
										
											2023-07-11 19:53:34 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // pool
 								    std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES];
 								    static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device) {
 								#if !defined(GGML_USE_HIPBLAS)
 								        if (get_cuda_global_info().devices[device].vmm) {
 								            return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
-x faster (rms) norm cuda kernels (3.7% e2e improvement) (#2985)

* 2x faster (rms) norm cuda kernels

* Fix code style
											
										
										
											2023-09-04 06:53:30 +00:00
+								        }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#endif
 								        return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
-												ggml : sync (abort callback, mul / add broadcast, fix alibi) (#2183)


											
										
										
											2023-07-11 19:53:34 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ggml_cuda_pool & pool(int device) {
 								        if (pools[device] == nullptr) {
 								            pools[device] = new_pool_for_device(device);
 								        }
 								        return *pools[device];
 								    }
-												ggml : sync (abort callback, mul / add broadcast, fix alibi) (#2183)


											
										
										
											2023-07-11 19:53:34 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ggml_cuda_pool & pool() {
 								        return pool(device);
-												ggml : sync (abort callback, mul / add broadcast, fix alibi) (#2183)


											
										
										
											2023-07-11 19:53:34 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								};
-												ggml : sync (abort callback, mul / add broadcast, fix alibi) (#2183)


											
										
										
											2023-07-11 19:53:34 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								// cuda buffer
 								struct ggml_backend_cuda_buffer_context {
 								    int device;
 								    void * dev_ptr = nullptr;
 								    std::string name;
 								    ggml_backend_cuda_buffer_context(int device, void * dev_ptr) :
 								        device(device), dev_ptr(dev_ptr),
 								        name(GGML_CUDA_NAME + std::to_string(device)) {
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
 								    ~ggml_backend_cuda_buffer_context() {
 								        CUDA_CHECK(cudaFree(dev_ptr));
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								};
 								GGML_CALL static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
 								    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
 								    return ctx->name.c_str();
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
 								    return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
 								    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
 								    delete ctx;
 								}
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
 								    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
 								    return ctx->dev_ptr;
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
 								    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
 								    if (tensor->view_src != NULL && tensor->view_offs == 0) {
 								        assert(tensor->view_src->buffer->buft == buffer->buft);
 								        tensor->backend = tensor->view_src->backend;
 								        tensor->extra = tensor->view_src->extra;
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 12:23:52 +00:00
+								        return;
 								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (ggml_is_quantized(tensor->type)) {
 								        // initialize padding to 0 to avoid possible NaN values
 								        size_t original_size = ggml_nbytes(tensor);
 								        size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 12:23:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        if (padded_size > original_size && tensor->view_src == nullptr) {
 								            ggml_cuda_set_device(ctx->device);
 								            CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
 								        }
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 12:23:52 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								}
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 12:23:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
 								    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 12:23:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ggml_cuda_set_device(ctx->device);
 								    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
 								    CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 12:23:52 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
 								    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ggml_cuda_set_device(ctx->device);
 								    CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
 								    CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
 								}
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
 								    if (ggml_backend_buffer_is_cuda(src->buffer)) {
 								        ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
 								        ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)dst->buffer->context;
 								        if (src_ctx->device == dst_ctx->device) {
 								            CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice, cudaStreamPerThread));
 								        } else {
 								            CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, dst_ctx->device, src->data, src_ctx->device, ggml_nbytes(src), cudaStreamPerThread));
 								        }
 								        CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
 								        return true;
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    return false;
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(buffer);
 								}
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
 								    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ggml_cuda_set_device(ctx->device);
 								    CUDA_CHECK(cudaDeviceSynchronize());
 								    CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
 								    CUDA_CHECK(cudaDeviceSynchronize());
 								}
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
 								    /* .get_name        = */ ggml_backend_cuda_buffer_get_name,
 								    /* .free_buffer     = */ ggml_backend_cuda_buffer_free_buffer,
 								    /* .get_base        = */ ggml_backend_cuda_buffer_get_base,
 								    /* .init_tensor     = */ ggml_backend_cuda_buffer_init_tensor,
 								    /* .set_tensor      = */ ggml_backend_cuda_buffer_set_tensor,
 								    /* .get_tensor      = */ ggml_backend_cuda_buffer_get_tensor,
 								    /* .cpy_tensor      = */ ggml_backend_cuda_buffer_cpy_tensor,
 								    /* .clear           = */ ggml_backend_cuda_buffer_clear,
 								    /* .reset           = */ NULL,
 								};
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								// cuda buffer type
 								struct ggml_backend_cuda_buffer_type_context {
 								    int device;
 								    std::string name;
 								};
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) {
 								    ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    return ctx->name.c_str();
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
 								    ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ggml_cuda_set_device(buft_ctx->device);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    void * dev_ptr;
 								    cudaError_t err = cudaMalloc(&dev_ptr, size);
 								    if (err != cudaSuccess) {
 								        fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
 								        return nullptr;
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
+								    }
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 13:38:36 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 13:38:36 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
 								}
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 13:38:36 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
 								    return 128;
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 13:38:36 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(buft);
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 13:38:36 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
 								    size_t size = ggml_nbytes(tensor);
 								    int64_t ne0 = tensor->ne[0];
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 13:38:36 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (ggml_is_quantized(tensor->type)) {
 								        if (ne0 % MATRIX_ROW_PADDING != 0) {
 								            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
 								        }
 								    }
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 13:38:36 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    return size;
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 13:38:36 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(buft);
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 13:38:36 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
 								    if (!ggml_backend_is_cuda(backend)) {
 								        return false;
 								    }
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 13:38:36 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
 								    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 13:38:36 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    return buft_ctx->device == cuda_ctx->device;
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 13:38:36 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
 								    /* .get_name         = */ ggml_backend_cuda_buffer_type_name,
 								    /* .alloc_buffer     = */ ggml_backend_cuda_buffer_type_alloc_buffer,
 								    /* .get_alignment    = */ ggml_backend_cuda_buffer_type_get_alignment,
 								    /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
 								    /* .get_alloc_size   = */ ggml_backend_cuda_buffer_type_get_alloc_size,
 								    /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
 								    /* .is_host          = */ NULL,
 								};
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 13:38:36 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
 								    static std::mutex mutex;
 								    std::lock_guard<std::mutex> lock(mutex);
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 13:38:36 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (device >= ggml_backend_cuda_get_device_count()) {
 								        return nullptr;
 								    }
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 13:38:36 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    static ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 13:38:36 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    static bool ggml_backend_cuda_buffer_type_initialized = false;
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 13:38:36 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (!ggml_backend_cuda_buffer_type_initialized) {
 								        for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
 								            ggml_backend_cuda_buffer_types[i] = {
 								                /* .iface    = */ ggml_backend_cuda_buffer_type_interface,
 								                /* .context  = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)},
 								            };
 								        }
 								        ggml_backend_cuda_buffer_type_initialized = true;
 								    }
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 13:38:36 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    return &ggml_backend_cuda_buffer_types[device];
 								}
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 13:38:36 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								// cuda split buffer
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 13:38:36 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
 								    int64_t min_compute_capability = INT_MAX;
 								    int64_t max_compute_capability = INT_MIN;
 								    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
 								        if (tensor_split[id] < (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
 								            if (min_compute_capability > get_cuda_global_info().devices[id].cc) {
 								                min_compute_capability = get_cuda_global_info().devices[id].cc;
 								            }
 								            if (max_compute_capability < get_cuda_global_info().devices[id].cc) {
 								                max_compute_capability = get_cuda_global_info().devices[id].cc;
 								            }
 								        }
 								    }
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 13:38:36 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								    switch(type) {
 								        case GGML_TYPE_Q4_0:
 								        case GGML_TYPE_Q4_1:
 								        case GGML_TYPE_Q5_0:
 								        case GGML_TYPE_Q5_1:
 								        case GGML_TYPE_Q8_0:
 								            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
 								        case GGML_TYPE_F16:
 								        case GGML_TYPE_F32:
 								            return 1;
 								        case GGML_TYPE_Q2_K:
 								            return max_compute_capability >= CC_RDNA2 ? 128 : 32;
 								        case GGML_TYPE_Q3_K:
 								            return min_compute_capability < CC_RDNA2 ? 128 : 64;
 								        case GGML_TYPE_Q4_K:
 								        case GGML_TYPE_Q5_K:
 								        case GGML_TYPE_Q6_K:
 								        case GGML_TYPE_IQ2_XXS:
 								        case GGML_TYPE_IQ2_XS:
 								        case GGML_TYPE_IQ2_S:
 								        case GGML_TYPE_IQ3_XXS:
 								        case GGML_TYPE_IQ1_S:
 								        case GGML_TYPE_IQ4_NL:
 								        case GGML_TYPE_IQ4_XS:
 								        case GGML_TYPE_IQ3_S:
 								            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
 								        default:
 								            GGML_ASSERT(false);
 								    }
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 08:23:56 +00:00
+								#else
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    switch(type) {
 								        case GGML_TYPE_Q4_0:
 								        case GGML_TYPE_Q4_1:
 								            return max_compute_capability >= CC_VOLTA ? 128 : 64;
 								        case GGML_TYPE_Q5_0:
 								        case GGML_TYPE_Q5_1:
 								        case GGML_TYPE_Q8_0:
 								            return 64;
 								        case GGML_TYPE_F16:
 								        case GGML_TYPE_F32:
 								            return 1;
 								        case GGML_TYPE_Q2_K:
 								        case GGML_TYPE_Q3_K:
 								        case GGML_TYPE_Q4_K:
 								        case GGML_TYPE_Q5_K:
 								        case GGML_TYPE_IQ2_XXS:
 								        case GGML_TYPE_IQ2_XS:
 								        case GGML_TYPE_IQ2_S:
 								        case GGML_TYPE_IQ3_XXS:
 								        case GGML_TYPE_IQ1_S:
 								        case GGML_TYPE_IQ4_NL:
 								        case GGML_TYPE_IQ4_XS:
 								        case GGML_TYPE_IQ3_S:
 								            return max_compute_capability >= CC_VOLTA ? 128 : 64;
 								        case GGML_TYPE_Q6_K:
 								            return 64;
 								        default:
 								            GGML_ASSERT(false);
 								    }
 								#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 13:38:36 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) {
 								    const int64_t nrows = ggml_nrows(tensor);
 								    const int64_t rounding = get_row_rounding(tensor->type, tensor_split);
-												CUDA: faster dequantize kernels for Q4_0 and Q4_1 (#4938)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-15 05:48:06 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    *row_low = id == 0 ? 0 : nrows*tensor_split[id];
 								    *row_low -= *row_low % rounding;
-												CUDA: faster dequantize kernels for Q4_0 and Q4_1 (#4938)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-15 05:48:06 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (id == ggml_backend_cuda_get_device_count() - 1) {
 								        *row_high = nrows;
 								    } else {
 								        *row_high = nrows*tensor_split[id + 1];
 								        *row_high -= *row_high % rounding;
-												CUDA: faster dequantize kernels for Q4_0 and Q4_1 (#4938)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-15 05:48:06 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								}
-												CUDA: faster dequantize kernels for Q4_0 and Q4_1 (#4938)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-15 05:48:06 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
 								    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-												CUDA: faster dequantize kernels for Q4_0 and Q4_1 (#4938)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-15 05:48:06 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
 								}
-												CUDA: faster dequantize kernels for Q4_0 and Q4_1 (#4938)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-15 05:48:06 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								struct ggml_backend_cuda_split_buffer_type_context {
 								    std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
 								};
-												CUDA: faster dequantize kernels for Q4_0 and Q4_1 (#4938)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-15 05:48:06 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								struct ggml_backend_cuda_split_buffer_context {
 								    ~ggml_backend_cuda_split_buffer_context() {
 								        for (ggml_tensor_extra_gpu * extra : tensor_extras) {
 								            for (int id = 0; id < GGML_CUDA_MAX_DEVICES; ++id) {
 								                for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) {
 								                    if (extra->events[id][is] != nullptr) {
 								                        CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
 								                    }
 								                }
 								                if (extra->data_device[id] != nullptr) {
 								                    CUDA_CHECK(cudaFree(extra->data_device[id]));
 								                }
 								            }
 								            delete extra;
 								        }
-												CUDA: faster dequantize kernels for Q4_0 and Q4_1 (#4938)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-15 05:48:06 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    std::vector<ggml_tensor_extra_gpu *> tensor_extras;
 								};
-												CUDA: faster dequantize kernels for Q4_0 and Q4_1 (#4938)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-15 05:48:06 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
 								    return GGML_CUDA_NAME "_Split";
-												CUDA: faster dequantize kernels for Q4_0 and Q4_1 (#4938)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-15 05:48:06 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(buffer);
 								}
-												CUDA: faster dequantize kernels for Q4_0 and Q4_1 (#4938)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-15 05:48:06 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
 								    return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
 								    GGML_UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds
 								}
-												CUDA: faster dequantize kernels for Q4_0 and Q4_1 (#4938)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-15 05:48:06 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
 								    ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
 								    delete ctx;
 								}
-												CUDA: faster dequantize kernels for Q4_0 and Q4_1 (#4938)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-15 05:48:06 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
 								    // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
 								    return (void *)0x1000;
-												CUDA: faster dequantize kernels for Q4_0 and Q4_1 (#4938)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-15 05:48:06 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(buffer);
-												CUDA: faster dequantize kernels for Q4_0 and Q4_1 (#4938)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-15 05:48:06 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
 								    GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
 								    ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t ne0 = tensor->ne[0];
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
 								    ctx->tensor_extras.push_back(extra);
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
 								        int64_t row_low, row_high;
 								        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        int64_t nrows_split = row_high - row_low;
 								        if (nrows_split == 0) {
 								            continue;
 								        }
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        size_t size = ggml_nbytes_split(tensor, nrows_split);
 								        const size_t original_size = size;
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
 								        if (ne0 % MATRIX_ROW_PADDING != 0) {
 								            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
 								        }
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        // FIXME: do not crash if cudaMalloc fails
 								        // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
 								        ggml_cuda_set_device(id);
 								        char * buf;
 								        CUDA_CHECK(cudaMalloc(&buf, size));
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        // set padding to 0 to avoid possible NaN values
 								        if (size > original_size) {
 								            CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
 								        }
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        extra->data_device[id] = buf;
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) {
 								            CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
 								        }
 								    }
 								    tensor->extra = extra;
 								}
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
 								    // split tensors must always be set in their entirety at once
 								    GGML_ASSERT(offset == 0);
 								    GGML_ASSERT(size == ggml_nbytes(tensor));
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t ne0 = tensor->ne[0];
 								    const size_t nb1 = tensor->nb[1];
 								    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
 								        int64_t row_low, row_high;
 								        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        int64_t nrows_split = row_high - row_low;
 								        if (nrows_split == 0) {
 								            continue;
 								        }
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const size_t offset_split = row_low*nb1;
 								        size_t size = ggml_nbytes_split(tensor, nrows_split);
 								        const size_t original_size = size;
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
 								        if (ne0 % MATRIX_ROW_PADDING != 0) {
 								            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
 								        }
 								        const char * buf_host = (const char *)data + offset_split;
 								        CUDA_CHECK(cudaMemcpyAsync(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice, cudaStreamPerThread));
 								    }
 								    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
 								        CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
+								    }
 								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
 								    // split tensors must always be set in their entirety at once
 								    GGML_ASSERT(offset == 0);
 								    GGML_ASSERT(size == ggml_nbytes(tensor));
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t ne0 = tensor->ne[0];
 								    const size_t nb1 = tensor->nb[1];
 								    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
 								        int64_t row_low, row_high;
 								        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        int64_t nrows_split = row_high - row_low;
 								        if (nrows_split == 0) {
 								            continue;
 								        }
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const size_t offset_split = row_low*nb1;
 								        size_t size = ggml_nbytes_split(tensor, nrows_split);
 								        const size_t original_size = size;
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
 								        if (ne0 % MATRIX_ROW_PADDING != 0) {
 								            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
 								        }
 								        char * buf_host = (char *)data + offset_split;
 								        CUDA_CHECK(cudaMemcpyAsync(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
 								    }
 								    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
 								        CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
+								    }
 								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								GGML_CALL static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
 								    GGML_UNUSED(buffer);
 								    GGML_UNUSED(value);
 								}
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
 								    /* .get_name        = */ ggml_backend_cuda_split_buffer_get_name,
 								    /* .free_buffer     = */ ggml_backend_cuda_split_buffer_free_buffer,
 								    /* .get_base        = */ ggml_backend_cuda_split_buffer_get_base,
 								    /* .init_tensor     = */ ggml_backend_cuda_split_buffer_init_tensor,
 								    /* .set_tensor      = */ ggml_backend_cuda_split_buffer_set_tensor,
 								    /* .get_tensor      = */ ggml_backend_cuda_split_buffer_get_tensor,
 								    /* .cpy_tensor      = */ NULL,
 								    /* .clear           = */ ggml_backend_cuda_split_buffer_clear,
 								    /* .reset           = */ NULL,
 								};
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								// cuda split buffer type
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
 								    return GGML_CUDA_NAME "_Split";
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(buft);
 								}
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
 								    // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
 								    // instead, we allocate them for each tensor separately in init_tensor
 								    // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
 								    // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
 								    ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context();
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
 								    return 128;
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(buft);
 								}
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
 								    ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    size_t total_size = 0;
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t ne0 = tensor->ne[0];
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
 								        int64_t row_low, row_high;
 								        get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id);
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        int64_t nrows_split = row_high - row_low;
 								        if (nrows_split == 0) {
 								            continue;
 								        }
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        total_size += ggml_nbytes_split(tensor, nrows_split);
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
 								        if (ne0 % MATRIX_ROW_PADDING != 0) {
 								            total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
 								        }
 								    }
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    return total_size;
 								}
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
 								    return ggml_backend_is_cuda(backend);
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(buft);
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
 								    return false;
 								    GGML_UNUSED(buft);
-												SOTA 2-bit quants (#4773)

* iq2_xxs: basics

* iq2_xxs: scalar and AVX2 dot products

Needed to change Q8_K to have quants in the -127...127 range,
else the IQ2_XXS AVX implementation becomes very awkward.
The alternative would have been to use Q8_0 instead. Perhaps
I'll change later, for now this is what we have.

* iq2_xxs: ARM_NEON dot product

Somehow strangely slow (112 ms/token).

* iq2_xxs: WIP Metal

Dequantize works, something is still wrong with the
dot product.

* iq2_xxs: Metal dot product now works

We have
PP-512 = 475 t/s
TG-128 = 47.3 t/s

Not the greatest performance, but not complete garbage either.

* iq2_xxs: slighty faster dot product

TG-128 is now 48.4 t/s

* iq2_xxs: slighty faster dot product

TG-128 is now 50.9 t/s

* iq2_xxs: even faster Metal dot product

TG-128 is now 54.1 t/s.

Strangely enough, putting the signs lookup table
into shared memory has a bigger impact than the
grid values being in shared memory.

* iq2_xxs: dequantize CUDA kernel - fix conflict with master

* iq2_xxs: quantized CUDA dot product (MMVQ)

We get TG-128 = 153.1 t/s

* iq2_xxs: slightly faster CUDA dot product

TG-128 is now at 155.1 t/s.

* iq2_xxs: add to llama ftype enum

* iq2_xxs: fix MoE on Metal

* Fix missing MMQ ops when on hipBLAS

I had put the ggml_supports_mmq call at the wrong place.

* Fix bug in qequantize_row_iq2_xxs

The 0.25f factor was missing.
Great detective work by @ggerganov!

* Fixing tests

* PR suggestion

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-08 15:02:32 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = {
 								    /* .get_name         = */ ggml_backend_cuda_split_buffer_type_name,
 								    /* .alloc_buffer     = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
 								    /* .get_alignment    = */ ggml_backend_cuda_split_buffer_type_get_alignment,
 								    /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
 								    /* .get_alloc_size   = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
 								    /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
 								    /* .is_host          = */ ggml_backend_cuda_split_buffer_type_is_host,
 								};
-												SOTA 2-bit quants (#4773)

* iq2_xxs: basics

* iq2_xxs: scalar and AVX2 dot products

Needed to change Q8_K to have quants in the -127...127 range,
else the IQ2_XXS AVX implementation becomes very awkward.
The alternative would have been to use Q8_0 instead. Perhaps
I'll change later, for now this is what we have.

* iq2_xxs: ARM_NEON dot product

Somehow strangely slow (112 ms/token).

* iq2_xxs: WIP Metal

Dequantize works, something is still wrong with the
dot product.

* iq2_xxs: Metal dot product now works

We have
PP-512 = 475 t/s
TG-128 = 47.3 t/s

Not the greatest performance, but not complete garbage either.

* iq2_xxs: slighty faster dot product

TG-128 is now 48.4 t/s

* iq2_xxs: slighty faster dot product

TG-128 is now 50.9 t/s

* iq2_xxs: even faster Metal dot product

TG-128 is now 54.1 t/s.

Strangely enough, putting the signs lookup table
into shared memory has a bigger impact than the
grid values being in shared memory.

* iq2_xxs: dequantize CUDA kernel - fix conflict with master

* iq2_xxs: quantized CUDA dot product (MMVQ)

We get TG-128 = 153.1 t/s

* iq2_xxs: slightly faster CUDA dot product

TG-128 is now at 155.1 t/s.

* iq2_xxs: add to llama ftype enum

* iq2_xxs: fix MoE on Metal

* Fix missing MMQ ops when on hipBLAS

I had put the ggml_supports_mmq call at the wrong place.

* Fix bug in qequantize_row_iq2_xxs

The 0.25f factor was missing.
Great detective work by @ggerganov!

* Fixing tests

* PR suggestion

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-08 15:02:32 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
 								    static std::mutex mutex;
 								    std::lock_guard<std::mutex> lock(mutex);
-												SOTA 2-bit quants (#4773)

* iq2_xxs: basics

* iq2_xxs: scalar and AVX2 dot products

Needed to change Q8_K to have quants in the -127...127 range,
else the IQ2_XXS AVX implementation becomes very awkward.
The alternative would have been to use Q8_0 instead. Perhaps
I'll change later, for now this is what we have.

* iq2_xxs: ARM_NEON dot product

Somehow strangely slow (112 ms/token).

* iq2_xxs: WIP Metal

Dequantize works, something is still wrong with the
dot product.

* iq2_xxs: Metal dot product now works

We have
PP-512 = 475 t/s
TG-128 = 47.3 t/s

Not the greatest performance, but not complete garbage either.

* iq2_xxs: slighty faster dot product

TG-128 is now 48.4 t/s

* iq2_xxs: slighty faster dot product

TG-128 is now 50.9 t/s

* iq2_xxs: even faster Metal dot product

TG-128 is now 54.1 t/s.

Strangely enough, putting the signs lookup table
into shared memory has a bigger impact than the
grid values being in shared memory.

* iq2_xxs: dequantize CUDA kernel - fix conflict with master

* iq2_xxs: quantized CUDA dot product (MMVQ)

We get TG-128 = 153.1 t/s

* iq2_xxs: slightly faster CUDA dot product

TG-128 is now at 155.1 t/s.

* iq2_xxs: add to llama ftype enum

* iq2_xxs: fix MoE on Metal

* Fix missing MMQ ops when on hipBLAS

I had put the ggml_supports_mmq call at the wrong place.

* Fix bug in qequantize_row_iq2_xxs

The 0.25f factor was missing.
Great detective work by @ggerganov!

* Fixing tests

* PR suggestion

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-08 15:02:32 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
-												SOTA 2-bit quants (#4773)

* iq2_xxs: basics

* iq2_xxs: scalar and AVX2 dot products

Needed to change Q8_K to have quants in the -127...127 range,
else the IQ2_XXS AVX implementation becomes very awkward.
The alternative would have been to use Q8_0 instead. Perhaps
I'll change later, for now this is what we have.

* iq2_xxs: ARM_NEON dot product

Somehow strangely slow (112 ms/token).

* iq2_xxs: WIP Metal

Dequantize works, something is still wrong with the
dot product.

* iq2_xxs: Metal dot product now works

We have
PP-512 = 475 t/s
TG-128 = 47.3 t/s

Not the greatest performance, but not complete garbage either.

* iq2_xxs: slighty faster dot product

TG-128 is now 48.4 t/s

* iq2_xxs: slighty faster dot product

TG-128 is now 50.9 t/s

* iq2_xxs: even faster Metal dot product

TG-128 is now 54.1 t/s.

Strangely enough, putting the signs lookup table
into shared memory has a bigger impact than the
grid values being in shared memory.

* iq2_xxs: dequantize CUDA kernel - fix conflict with master

* iq2_xxs: quantized CUDA dot product (MMVQ)

We get TG-128 = 153.1 t/s

* iq2_xxs: slightly faster CUDA dot product

TG-128 is now at 155.1 t/s.

* iq2_xxs: add to llama ftype enum

* iq2_xxs: fix MoE on Metal

* Fix missing MMQ ops when on hipBLAS

I had put the ggml_supports_mmq call at the wrong place.

* Fix bug in qequantize_row_iq2_xxs

The 0.25f factor was missing.
Great detective work by @ggerganov!

* Fixing tests

* PR suggestion

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-08 15:02:32 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {};
-												SOTA 2-bit quants (#4773)

* iq2_xxs: basics

* iq2_xxs: scalar and AVX2 dot products

Needed to change Q8_K to have quants in the -127...127 range,
else the IQ2_XXS AVX implementation becomes very awkward.
The alternative would have been to use Q8_0 instead. Perhaps
I'll change later, for now this is what we have.

* iq2_xxs: ARM_NEON dot product

Somehow strangely slow (112 ms/token).

* iq2_xxs: WIP Metal

Dequantize works, something is still wrong with the
dot product.

* iq2_xxs: Metal dot product now works

We have
PP-512 = 475 t/s
TG-128 = 47.3 t/s

Not the greatest performance, but not complete garbage either.

* iq2_xxs: slighty faster dot product

TG-128 is now 48.4 t/s

* iq2_xxs: slighty faster dot product

TG-128 is now 50.9 t/s

* iq2_xxs: even faster Metal dot product

TG-128 is now 54.1 t/s.

Strangely enough, putting the signs lookup table
into shared memory has a bigger impact than the
grid values being in shared memory.

* iq2_xxs: dequantize CUDA kernel - fix conflict with master

* iq2_xxs: quantized CUDA dot product (MMVQ)

We get TG-128 = 153.1 t/s

* iq2_xxs: slightly faster CUDA dot product

TG-128 is now at 155.1 t/s.

* iq2_xxs: add to llama ftype enum

* iq2_xxs: fix MoE on Metal

* Fix missing MMQ ops when on hipBLAS

I had put the ggml_supports_mmq call at the wrong place.

* Fix bug in qequantize_row_iq2_xxs

The 0.25f factor was missing.
Great detective work by @ggerganov!

* Fixing tests

* PR suggestion

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-08 15:02:32 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_CUDA_MAX_DEVICES, [](float x) { return x == 0.0f; });
 								    if (all_zero) {
 								        tensor_split_arr = get_cuda_global_info().default_tensor_split;
 								    } else {
 								        float split_sum = 0.0f;
 								        for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
 								            tensor_split_arr[i] = split_sum;
 								            split_sum += tensor_split[i];
 								        }
 								        for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
 								            tensor_split_arr[i] /= split_sum;
 								        }
 								    }
-												ggml : SOTA 2-bit quants (add IQ2_XS) (#4856)

* iq2_xs: basics

* iq2_xs: this should have been in the basics

* iq2_xs: CUDA and scalar CPU works

* iq2_xs: WIP Metal

* iq2_xs: Metal now works

* iq2_xs: working, but dog slow, ARM_NEON dot product

* iq2_xs: better ARM_NEON dot product

We are now at 19.5 t/s for TG-128 and 61 t/s for PP-512 when
running on the CPU.

* iq2_xs: AVX2 dot product - 19.5 t/s

* iq2_xs: faster AVX2 dit product

21.4 t/s for TG-128, 59.2 t/s for PP-512.
The latter is 2x compared to the previous version.

* iq2_xs: had forgotten to delete iq2-data.h

* Add llama enum for IQ2_XS

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-11 19:39:39 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    auto it = buft_map.find(tensor_split_arr);
 								    if (it != buft_map.end()) {
 								        return &it->second;
 								    }
-												ggml : SOTA 2-bit quants (add IQ2_XS) (#4856)

* iq2_xs: basics

* iq2_xs: this should have been in the basics

* iq2_xs: CUDA and scalar CPU works

* iq2_xs: WIP Metal

* iq2_xs: Metal now works

* iq2_xs: working, but dog slow, ARM_NEON dot product

* iq2_xs: better ARM_NEON dot product

We are now at 19.5 t/s for TG-128 and 61 t/s for PP-512 when
running on the CPU.

* iq2_xs: AVX2 dot product - 19.5 t/s

* iq2_xs: faster AVX2 dit product

21.4 t/s for TG-128, 59.2 t/s for PP-512.
The latter is 2x compared to the previous version.

* iq2_xs: had forgotten to delete iq2-data.h

* Add llama enum for IQ2_XS

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-11 19:39:39 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    struct ggml_backend_buffer_type buft {
 								        /* .iface   = */ ggml_backend_cuda_split_buffer_type_interface,
 								        /* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr},
 								    };
-												ggml : SOTA 2-bit quants (add IQ2_XS) (#4856)

* iq2_xs: basics

* iq2_xs: this should have been in the basics

* iq2_xs: CUDA and scalar CPU works

* iq2_xs: WIP Metal

* iq2_xs: Metal now works

* iq2_xs: working, but dog slow, ARM_NEON dot product

* iq2_xs: better ARM_NEON dot product

We are now at 19.5 t/s for TG-128 and 61 t/s for PP-512 when
running on the CPU.

* iq2_xs: AVX2 dot product - 19.5 t/s

* iq2_xs: faster AVX2 dit product

21.4 t/s for TG-128, 59.2 t/s for PP-512.
The latter is 2x compared to the previous version.

* iq2_xs: had forgotten to delete iq2-data.h

* Add llama enum for IQ2_XS

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-11 19:39:39 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    auto result = buft_map.emplace(tensor_split_arr, buft);
 								    return &result.first->second;
-												ggml : SOTA 2-bit quants (add IQ2_XS) (#4856)

* iq2_xs: basics

* iq2_xs: this should have been in the basics

* iq2_xs: CUDA and scalar CPU works

* iq2_xs: WIP Metal

* iq2_xs: Metal now works

* iq2_xs: working, but dog slow, ARM_NEON dot product

* iq2_xs: better ARM_NEON dot product

We are now at 19.5 t/s for TG-128 and 61 t/s for PP-512 when
running on the CPU.

* iq2_xs: AVX2 dot product - 19.5 t/s

* iq2_xs: faster AVX2 dit product

21.4 t/s for TG-128, 59.2 t/s for PP-512.
The latter is 2x compared to the previous version.

* iq2_xs: had forgotten to delete iq2-data.h

* Add llama enum for IQ2_XS

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-11 19:39:39 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								// host buffer type
-												Adding IQ2_S and IQ2_M to complete coverage of the 2-3 bit quantization range (#5721)

* Adding IQ2_S and IQ2_M as a single cumulative commit

* Update examples/quantize/quantize.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-02-26 16:28:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								GGML_CALL static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
 								    return GGML_CUDA_NAME "_Host";
-												Adding IQ2_S and IQ2_M to complete coverage of the 2-3 bit quantization range (#5721)

* Adding IQ2_S and IQ2_M as a single cumulative commit

* Update examples/quantize/quantize.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-02-26 16:28:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(buft);
-												Adding IQ2_S and IQ2_M to complete coverage of the 2-3 bit quantization range (#5721)

* Adding IQ2_S and IQ2_M as a single cumulative commit

* Update examples/quantize/quantize.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-02-26 16:28:38 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								GGML_CALL static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
 								    return GGML_CUDA_NAME "_Host";
-												SOTA 3-bit quants  (#5196)

* iq3_xxs: quantize/dequantize

RMSE seems a bit high-ish at about half-way between q2_K and
q3_K, so need to check more.

* iq3_xxs: CUDA dequantize works

* iq2_xxs: tuning quantization

* iq3_xxs: starting to look better

PPL on wiki.test.raw
LLaMA-v1-7B: 6.4218
LLaMA-v2-7B: 6.3560
Mistral-7B : 6.0717

This is better than Q3_K_XS, with a 5% reduction in quantized model
size.

* iq3_xxs: CUDA dot product

We have
PP-512: 5891 t/s
TG-128: 143.9 t/s

* iq3_xxs: scalar and AVX2 dot products

* iq3_xxs: ARM_NEON and Metal

Metal performance is decent, ARM_NEON is pathetic

* iq3_xxs: slightly better grid points

* Faster iq3_xxs and iq2_xs dot products on CUDA

* iq3_xxs: add some quant mix

* iq3_xxs: fix failing quantization test

Dot product still fails. Is this real?

* iq3_xxs: hopefully fix ROCm

* iq3_xxs: failing tests

This time the dot product accuracy did find an actual bug
in the AVX2 implementation.

* Add IQ3_XXS to test-backend-ops

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-30 13:14:12 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(buffer);
 								}
-												SOTA 3-bit quants  (#5196)

* iq3_xxs: quantize/dequantize

RMSE seems a bit high-ish at about half-way between q2_K and
q3_K, so need to check more.

* iq3_xxs: CUDA dequantize works

* iq2_xxs: tuning quantization

* iq3_xxs: starting to look better

PPL on wiki.test.raw
LLaMA-v1-7B: 6.4218
LLaMA-v2-7B: 6.3560
Mistral-7B : 6.0717

This is better than Q3_K_XS, with a 5% reduction in quantized model
size.

* iq3_xxs: CUDA dot product

We have
PP-512: 5891 t/s
TG-128: 143.9 t/s

* iq3_xxs: scalar and AVX2 dot products

* iq3_xxs: ARM_NEON and Metal

Metal performance is decent, ARM_NEON is pathetic

* iq3_xxs: slightly better grid points

* Faster iq3_xxs and iq2_xs dot products on CUDA

* iq3_xxs: add some quant mix

* iq3_xxs: fix failing quantization test

Dot product still fails. Is this real?

* iq3_xxs: hopefully fix ROCm

* iq3_xxs: failing tests

This time the dot product accuracy did find an actual bug
in the AVX2 implementation.

* Add IQ3_XXS to test-backend-ops

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-30 13:14:12 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								GGML_CALL static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
 								    CUDA_CHECK(cudaFreeHost(buffer->context));
-												SOTA 3-bit quants  (#5196)

* iq3_xxs: quantize/dequantize

RMSE seems a bit high-ish at about half-way between q2_K and
q3_K, so need to check more.

* iq3_xxs: CUDA dequantize works

* iq2_xxs: tuning quantization

* iq3_xxs: starting to look better

PPL on wiki.test.raw
LLaMA-v1-7B: 6.4218
LLaMA-v2-7B: 6.3560
Mistral-7B : 6.0717

This is better than Q3_K_XS, with a 5% reduction in quantized model
size.

* iq3_xxs: CUDA dot product

We have
PP-512: 5891 t/s
TG-128: 143.9 t/s

* iq3_xxs: scalar and AVX2 dot products

* iq3_xxs: ARM_NEON and Metal

Metal performance is decent, ARM_NEON is pathetic

* iq3_xxs: slightly better grid points

* Faster iq3_xxs and iq2_xs dot products on CUDA

* iq3_xxs: add some quant mix

* iq3_xxs: fix failing quantization test

Dot product still fails. Is this real?

* iq3_xxs: hopefully fix ROCm

* iq3_xxs: failing tests

This time the dot product accuracy did find an actual bug
in the AVX2 implementation.

* Add IQ3_XXS to test-backend-ops

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-30 13:14:12 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void * ggml_cuda_host_malloc(size_t size) {
 								    if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
 								        return nullptr;
 								    }
-												IQ3_S: a much better alternative to Q3_K (#5676)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* Resurrecting iq3_xs

After all the experimentation, nothing was better than this.

* Minor PPL improvement via a block scale fudge factor

* Minor improvement via 3 neighbours

* iq3_xs: working scalar and AVX2 dot products

* iq3_xs: ARM_NEON dot product - works but extremely slow (10 t/s)

* iq3_xs: working Metal implementation

* Adding IQ3_M - IQ3_XS mix with mostly Q4_K

* iiq3_xs: a 3.4375 bpw variant

* iq3_xs: make CUDA work for new version

* iq3_xs: make scalar and AVX2 work for new version

* iq3_s: make ARM_NEON work with new version

* iq3_xs: make new version work on metal

Performance is very similar to Q3_K_S

* iq3_xs: tiny Metal speed improvement

* iq3_xs: tiny Metal speed improvement

* Fix stupid warning

* Q3_K_XS now uses a mix of IQ3_XS and IQ3_XXS

* iq3_xs: rename to iq3_s

* iq3_s: make tests pass

* Move Q3_K_XS mix to 3.25 bpw

* Attempt to fix failing tests

* Another attempt to fix the Windows builds

* Attempt to fix ROCm

* ROCm again

* iq3_s: partial fix for QK_K = 64

* iq3_s: make it work on metal for QK_K = 64

Pleasent surprise: the coding was super-block size independent,
so all it took was to delete some QK_K == 256 guards.

* Will this fix ROCm?

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-24 14:23:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    void * ptr = nullptr;
 								    cudaError_t err = cudaMallocHost((void **) &ptr, size);
 								    if (err != cudaSuccess) {
 								        // clear the error
 								        cudaGetLastError();
 								        fprintf(stderr, "%s: warning: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
 								            size/1024.0/1024.0, cudaGetErrorString(err));
 								        return nullptr;
-												IQ3_S: a much better alternative to Q3_K (#5676)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* Resurrecting iq3_xs

After all the experimentation, nothing was better than this.

* Minor PPL improvement via a block scale fudge factor

* Minor improvement via 3 neighbours

* iq3_xs: working scalar and AVX2 dot products

* iq3_xs: ARM_NEON dot product - works but extremely slow (10 t/s)

* iq3_xs: working Metal implementation

* Adding IQ3_M - IQ3_XS mix with mostly Q4_K

* iiq3_xs: a 3.4375 bpw variant

* iq3_xs: make CUDA work for new version

* iq3_xs: make scalar and AVX2 work for new version

* iq3_s: make ARM_NEON work with new version

* iq3_xs: make new version work on metal

Performance is very similar to Q3_K_S

* iq3_xs: tiny Metal speed improvement

* iq3_xs: tiny Metal speed improvement

* Fix stupid warning

* Q3_K_XS now uses a mix of IQ3_XS and IQ3_XXS

* iq3_xs: rename to iq3_s

* iq3_s: make tests pass

* Move Q3_K_XS mix to 3.25 bpw

* Attempt to fix failing tests

* Another attempt to fix the Windows builds

* Attempt to fix ROCm

* ROCm again

* iq3_s: partial fix for QK_K = 64

* iq3_s: make it work on metal for QK_K = 64

Pleasent surprise: the coding was super-block size independent,
so all it took was to delete some QK_K == 256 guards.

* Will this fix ROCm?

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-24 14:23:52 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    return ptr;
-												IQ3_S: a much better alternative to Q3_K (#5676)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* Resurrecting iq3_xs

After all the experimentation, nothing was better than this.

* Minor PPL improvement via a block scale fudge factor

* Minor improvement via 3 neighbours

* iq3_xs: working scalar and AVX2 dot products

* iq3_xs: ARM_NEON dot product - works but extremely slow (10 t/s)

* iq3_xs: working Metal implementation

* Adding IQ3_M - IQ3_XS mix with mostly Q4_K

* iiq3_xs: a 3.4375 bpw variant

* iq3_xs: make CUDA work for new version

* iq3_xs: make scalar and AVX2 work for new version

* iq3_s: make ARM_NEON work with new version

* iq3_xs: make new version work on metal

Performance is very similar to Q3_K_S

* iq3_xs: tiny Metal speed improvement

* iq3_xs: tiny Metal speed improvement

* Fix stupid warning

* Q3_K_XS now uses a mix of IQ3_XS and IQ3_XXS

* iq3_xs: rename to iq3_s

* iq3_s: make tests pass

* Move Q3_K_XS mix to 3.25 bpw

* Attempt to fix failing tests

* Another attempt to fix the Windows builds

* Attempt to fix ROCm

* ROCm again

* iq3_s: partial fix for QK_K = 64

* iq3_s: make it work on metal for QK_K = 64

Pleasent surprise: the coding was super-block size independent,
so all it took was to delete some QK_K == 256 guards.

* Will this fix ROCm?

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-24 14:23:52 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
 								    void * ptr = ggml_cuda_host_malloc(size);
-.5 bit quantization (#5453)

* iq1_s: WIP basics

* iq1_s: CUDA is working

* iq1_s: scalar CPU dot product

* iq1_s: WIP AVX2 dot product - something is not right

* Fix tests

* Fix shadow warnings

* Fix after merge with latest master

* iq1_s: AVX2 finally works

* iq1_s: ARM_NEON dot product. Works, but not very fast

* iq1_s: better grid

* iq1_s: use IQ2_XXS for attn_output

At a cost of 0.04 extra bpw this gives a big improvement in PPL.

* iq1_s: Metal basics

Dequantize works, but not dot product

* iq1_s: Metal works, but quite slow

As usual, Apple Silicon does not like the code I write.

* iq1_s: Tests

* iq1_s: slightly faster dot product

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-18 16:16:55 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (ptr == nullptr) {
 								        // fallback to cpu buffer
 								        return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
-												Better 1.5 bit quantization  (#5971)

* Trying blocvks of 16 for IQ1_S - seems slightly better

* iq1s_blocks16: Adjust scale fudge factor to 1.125

* iq1s_blocks16: going to blocks of 32

with 2048 lattice points, so same bpw.
This is even better than blocks of 16.
Should I try blocks of 64? But to keep the same
bpw, when I go to 4096 lattice points, I need to
remove blocks alltogether and just have superblocks of
256 weights.

* iq1s_blocks16: Use 2*<x^2> as sigma2 in weight adjustment

* iq1s_blocks16: scalar and AVX2 dot products

* iq1s_blocks16: CUDA dot product

* iq1s_blocks16: Metal works, Neon does not

Metal works but TG is dog slow (35 t/s). PP is OKish (493 t/s).
Not seeing the bug in the Neon implementation for now.

* iq1s_blocks16: fixed Neon

* iq1s_blocks16: very slightly faster TG on Metal

Still pathetic at 37 t/s

* iq1s_blocks16: speedup Metal by packing codebook into uint32_t's

* Formatting

* iq1s_blocks16: uint32_t codebook is also better in CUDA

TG-128 is now 204 t/s up from 194 t/s.
PP-512 is 5890 t/s, so significantly better than other quants

* iq1s_blocks16: slightly faster Neon dot product

* iq1s_blocks16: faster AVX2 dot product

* iq1s_blocks16: adjust to ggml-common.h

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-03-11 06:51:49 +00:00
+								    }
-.5 bit quantization (#5453)

* iq1_s: WIP basics

* iq1_s: CUDA is working

* iq1_s: scalar CPU dot product

* iq1_s: WIP AVX2 dot product - something is not right

* Fix tests

* Fix shadow warnings

* Fix after merge with latest master

* iq1_s: AVX2 finally works

* iq1_s: ARM_NEON dot product. Works, but not very fast

* iq1_s: better grid

* iq1_s: use IQ2_XXS for attn_output

At a cost of 0.04 extra bpw this gives a big improvement in PPL.

* iq1_s: Metal basics

Dequantize works, but not dot product

* iq1_s: Metal works, but quite slow

As usual, Apple Silicon does not like the code I write.

* iq1_s: Tests

* iq1_s: slightly faster dot product

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-18 16:16:55 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
 								    buffer->buft = buft;
 								    buffer->iface.get_name = ggml_backend_cuda_host_buffer_name;
 								    buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
 								    return buffer;
-.5 bit quantization (#5453)

* iq1_s: WIP basics

* iq1_s: CUDA is working

* iq1_s: scalar CPU dot product

* iq1_s: WIP AVX2 dot product - something is not right

* Fix tests

* Fix shadow warnings

* Fix after merge with latest master

* iq1_s: AVX2 finally works

* iq1_s: ARM_NEON dot product. Works, but not very fast

* iq1_s: better grid

* iq1_s: use IQ2_XXS for attn_output

At a cost of 0.04 extra bpw this gives a big improvement in PPL.

* iq1_s: Metal basics

Dequantize works, but not dot product

* iq1_s: Metal works, but quite slow

As usual, Apple Silicon does not like the code I write.

* iq1_s: Tests

* iq1_s: slightly faster dot product

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-18 16:16:55 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
 								    static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
 								        /* .iface    = */ {
 								            /* .get_name         = */ ggml_backend_cuda_host_buffer_type_name,
 								            /* .alloc_buffer     = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
 								            /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
 								            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
 								            /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
 								            /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
 								            /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
 								        },
 								        /* .context  = */ nullptr,
 								    };
-												IQ4_NL: 4-bit non-linear quants with blocks of 32 (#5590)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* iq4_nl: Fix after merging with master

* iq4_nl: another fix after merging with master

* Use IQ4_NL instead of Q4_K when using k-quants is not possible

* Fix typo that makes several tests fail

* It was the ggml_vdotq thing missed inside the brackets

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-21 09:39:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    return &ggml_backend_cuda_buffer_type_host;
 								}
-												IQ4_NL: 4-bit non-linear quants with blocks of 32 (#5590)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* iq4_nl: Fix after merging with master

* iq4_nl: another fix after merging with master

* Use IQ4_NL instead of Q4_K when using k-quants is not possible

* Fix typo that makes several tests fail

* It was the ggml_vdotq thing missed inside the brackets

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-21 09:39:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								//static bool ggml_backend_buffer_is_cuda_host(ggml_backend_buffer_t buffer) {
 								//    return buffer->buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name;
 								//}
-												IQ4_NL: 4-bit non-linear quants with blocks of 32 (#5590)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* iq4_nl: Fix after merging with master

* iq4_nl: another fix after merging with master

* Use IQ4_NL instead of Q4_K when using k-quants is not possible

* Fix typo that makes several tests fail

* It was the ggml_vdotq thing missed inside the brackets

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-21 09:39:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
 								/// kernels
 								#if defined(GGML_USE_HIPBLAS)
 								#define __CUDA_ARCH__ 1300
 								#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
 								    defined(__gfx1150__) || defined(__gfx1151__)
 								#define RDNA3
 								#endif
 								#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
 								    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
 								#define RDNA2
 								#endif
 								#ifndef __has_builtin
 								    #define __has_builtin(x) 0
 								#endif
 								typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
 								typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
 								static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
 								    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
 								    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
 								#if __has_builtin(__builtin_elementwise_sub_sat)
 								    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
 								    return reinterpret_cast<const int &>(c);
 								#else
 								    int8x4_t c;
 								    int16_t tmp;
 								#pragma unroll
 								    for (int i = 0; i < 4; i++) {
 								        tmp = va[i] - vb[i];
 								        if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
 								        if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
 								        c[i] = tmp;
 								    }
 								    return reinterpret_cast<int &>(c);
 								#endif // __has_builtin(__builtin_elementwise_sub_sat)
-												IQ4_NL: 4-bit non-linear quants with blocks of 32 (#5590)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* iq4_nl: Fix after merging with master

* iq4_nl: another fix after merging with master

* Use IQ4_NL instead of Q4_K when using k-quants is not possible

* Fix typo that makes several tests fail

* It was the ggml_vdotq thing missed inside the brackets

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-21 09:39:52 +00:00
+								}
-.5 bit quantization (#5453)

* iq1_s: WIP basics

* iq1_s: CUDA is working

* iq1_s: scalar CPU dot product

* iq1_s: WIP AVX2 dot product - something is not right

* Fix tests

* Fix shadow warnings

* Fix after merge with latest master

* iq1_s: AVX2 finally works

* iq1_s: ARM_NEON dot product. Works, but not very fast

* iq1_s: better grid

* iq1_s: use IQ2_XXS for attn_output

At a cost of 0.04 extra bpw this gives a big improvement in PPL.

* iq1_s: Metal basics

Dequantize works, but not dot product

* iq1_s: Metal works, but quite slow

As usual, Apple Silicon does not like the code I write.

* iq1_s: Tests

* iq1_s: slightly faster dot product

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-18 16:16:55 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ __forceinline__ int __vsub4(const int a, const int b) {
 								    return __vsubss4(a, b);
 								}
-												IQ4_XS: a 4.25 bpw quantization (#5747)

* Try IQ4_NL with blocks of 64 - does not look good

* iq4_xs: go to super-blocks of 256 and 6-bit scales for blocks of 32

* iq4_xs: CUDA works - 133.2 t/s

* iq4_xs: AVX2 dot product

* iq4_xs: ARM_NEON dot product

* iq4_nl: Metal implementation

As usual, Metal / Apple Silicon don't like my quants.

* iq3_xs: minor fix

* iq4_xs: shrink by using IQ3_S for attn_k and attn_q

* iq4_xs: revert using IQ3_S for attn_k and attn_v

PPL vs size is good, but CPU performance suffers: on M2 Max
TG-128 drops to 21.7 t/s from 28.8, and on a Ryzen-7950X
to 14.5 t/s from 15.8 t/s. On CUDA we have 135 t/s when
using IQ3_S vs 133 t/s with pure IQ4_XS.

* Fix CI

* iq4_xs: Added forgotten check for 256 divisibility

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-27 14:34:24 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
 								    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
 								    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
 								    unsigned int c;
 								    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
 								#pragma unroll
 								    for (int i = 0; i < 4; ++i) {
 								        vc[i] = va[i] == vb[i] ? 0xff : 0x00;
-												IQ4_XS: a 4.25 bpw quantization (#5747)

* Try IQ4_NL with blocks of 64 - does not look good

* iq4_xs: go to super-blocks of 256 and 6-bit scales for blocks of 32

* iq4_xs: CUDA works - 133.2 t/s

* iq4_xs: AVX2 dot product

* iq4_xs: ARM_NEON dot product

* iq4_nl: Metal implementation

As usual, Metal / Apple Silicon don't like my quants.

* iq3_xs: minor fix

* iq4_xs: shrink by using IQ3_S for attn_k and attn_q

* iq4_xs: revert using IQ3_S for attn_k and attn_v

PPL vs size is good, but CPU performance suffers: on M2 Max
TG-128 drops to 21.7 t/s from 28.8, and on a Ryzen-7950X
to 14.5 t/s from 15.8 t/s. On CUDA we have 135 t/s when
using IQ3_S vs 133 t/s with pure IQ4_XS.

* Fix CI

* iq4_xs: Added forgotten check for 256 divisibility

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-27 14:34:24 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    return c;
-												IQ4_XS: a 4.25 bpw quantization (#5747)

* Try IQ4_NL with blocks of 64 - does not look good

* iq4_xs: go to super-blocks of 256 and 6-bit scales for blocks of 32

* iq4_xs: CUDA works - 133.2 t/s

* iq4_xs: AVX2 dot product

* iq4_xs: ARM_NEON dot product

* iq4_nl: Metal implementation

As usual, Metal / Apple Silicon don't like my quants.

* iq3_xs: minor fix

* iq4_xs: shrink by using IQ3_S for attn_k and attn_q

* iq4_xs: revert using IQ3_S for attn_k and attn_v

PPL vs size is good, but CPU performance suffers: on M2 Max
TG-128 drops to 21.7 t/s from 28.8, and on a Ryzen-7950X
to 14.5 t/s from 15.8 t/s. On CUDA we have 135 t/s when
using IQ3_S vs 133 t/s with pure IQ4_XS.

* Fix CI

* iq4_xs: Added forgotten check for 256 divisibility

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-27 14:34:24 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
 								#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
 								    c = __builtin_amdgcn_sdot4(a, b, c, false);
 								#elif defined(RDNA3)
 								    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
 								#elif defined(__gfx1010__) || defined(__gfx900__)
 								    int tmp1;
 								    int tmp2;
 								    asm("\n \
 								        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
 								        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
 								        v_add3_u32 %0, %1, %2, %0 \n \
 								        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
 								        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
 								        v_add3_u32 %0, %1, %2, %0 \n \
 								        "
 								        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
 								        : "v"(a), "v"(b)
 								    );
 								#else
 								    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
 								    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
 								    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
 								#endif
 								    return c;
 								}
 								#endif // defined(GGML_USE_HIPBLAS)
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#ifdef GGML_CUDA_F16
 								typedef half dfloat; // dequantize float
 								typedef half2 dfloat2;
 								#else
 								typedef float dfloat; // dequantize float
 								typedef float2 dfloat2;
 								#endif //GGML_CUDA_F16
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
 								    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int x32 = 0;
 								    x32 |= x16[0] <<  0;
 								    x32 |= x16[1] << 16;
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    return x32;
 								}
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
 								    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int x32 = 0;
 								    x32 |= x16[0] <<  0;
 								    x32 |= x16[1] << 16;
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    return x32;
 								}
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
 								    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
 								}
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
 								    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
 								}
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<typename T>
 								using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int k, cudaStream_t stream);
 								typedef to_t_cuda_t<float> to_fp32_cuda_t;
 								typedef to_t_cuda_t<half> to_fp16_cuda_t;
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
 								typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
 								typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								typedef void (*ggml_cuda_func_t)(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								typedef void (*ggml_cuda_op_mul_mat_t)(
 								    ggml_backend_cuda_context & ctx,
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
 								    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
 								    const int64_t src1_padded_row_size, cudaStream_t stream);
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								typedef void (*ggml_cuda_op_flatten_t)(
 								    ggml_backend_cuda_context & ctx,
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream);
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
 								typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
 								typedef void (*load_tiles_cuda_t)(
 								    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
 								    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row);
 								typedef float (*vec_dot_q_mul_mat_cuda_t)(
 								    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
 								    const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k);
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#define WARP_SIZE 32
 								#define CUDA_GELU_BLOCK_SIZE 256
 								#define CUDA_SILU_BLOCK_SIZE 256
 								#define CUDA_TANH_BLOCK_SIZE 256
 								#define CUDA_RELU_BLOCK_SIZE 256
 								#define CUDA_HARDSIGMOID_BLOCK_SIZE 256
 								#define CUDA_HARDSWISH_BLOCK_SIZE 256
 								#define CUDA_SQR_BLOCK_SIZE 256
 								#define CUDA_CPY_BLOCK_SIZE 32
 								#define CUDA_SCALE_BLOCK_SIZE 256
 								#define CUDA_CLAMP_BLOCK_SIZE 256
 								#define CUDA_ROPE_BLOCK_SIZE 256
 								#define CUDA_SOFT_MAX_BLOCK_SIZE 1024
 								#define CUDA_ALIBI_BLOCK_SIZE 32
 								#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
 								#define CUDA_QUANTIZE_BLOCK_SIZE 256
 								#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
 								#define CUDA_GET_ROWS_BLOCK_SIZE 256
 								#define CUDA_UPSCALE_BLOCK_SIZE 256
 								#define CUDA_CONCAT_BLOCK_SIZE 256
 								#define CUDA_PAD_BLOCK_SIZE 256
 								#define CUDA_ARANGE_BLOCK_SIZE 256
 								#define CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE 256
 								#define CUDA_ACC_BLOCK_SIZE 256
 								#define CUDA_IM2COL_BLOCK_SIZE 256
 								#define CUDA_POOL2D_BLOCK_SIZE 256
 								#define CUDA_Q8_0_NE_ALIGN 2048
 								// dmmv = dequantize_mul_mat_vec
 								#ifndef GGML_CUDA_DMMV_X
 								#define GGML_CUDA_DMMV_X 32
 								#endif
 								#ifndef GGML_CUDA_MMV_Y
 								#define GGML_CUDA_MMV_Y 1
 								#endif
 								#ifndef K_QUANTS_PER_ITERATION
 								#define K_QUANTS_PER_ITERATION 2
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
+								#else
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
 								#endif
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE
 								#define GGML_CUDA_PEER_MAX_BATCH_SIZE 128
 								#endif // GGML_CUDA_PEER_MAX_BATCH_SIZE
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#define MUL_MAT_SRC1_COL_STRIDE 128
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								[[noreturn]]
 								static __device__ void no_device_code(
 								    const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								    printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
 								           file_name, line, function_name, arch);
 								    GGML_UNUSED(arch_list);
 								#else
 								    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
 								           file_name, line, function_name, arch, arch_list);
 								#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								    __trap();
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(no_device_code); // suppress unused function warning
 								}
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#ifdef __CUDA_ARCH__
 								#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
 								#else
 								//#define NO_DEVICE_CODE GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
 								#define NO_DEVICE_CODE
 								#endif // __CUDA_ARCH__
 								static __device__ __forceinline__ float warp_reduce_sum(float x) {
 								#pragma unroll
 								    for (int mask = 16; mask > 0; mask >>= 1) {
 								        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    return x;
 								}
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
 								#pragma unroll
 								    for (int mask = 16; mask > 0; mask >>= 1) {
 								        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
 								        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
 								    }
 								    return a;
 								}
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#ifdef GGML_CUDA_F16
 								static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
 								#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
 								#pragma unroll
 								   for (int mask = 16; mask > 0; mask >>= 1) {
 								       a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
 								   }
 								   return a;
 								#else
 								   GGML_UNUSED(a);
 								   NO_DEVICE_CODE;
 								#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
 								}
 								#endif // GGML_CUDA_F16
 								static __device__ __forceinline__ float warp_reduce_max(float x) {
 								#pragma unroll
 								    for (int mask = 16; mask > 0; mask >>= 1) {
 								        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    return x;
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								//static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
 								//#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
 								//#pragma unroll
 								//    for (int mask = 16; mask > 0; mask >>= 1) {
 								//        x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
 								//    }
 								//    return x;
 								//#else
 								//    GGML_UNUSED(x);
 								//    NO_DEVICE_CODE;
 								//#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
 								//}
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ __forceinline__ float op_repeat(const float a, const float b) {
 								    return b;
 								    GGML_UNUSED(a);
 								}
-												cuda : faster k-quants on older GPUs (#1930)

* k_quants: hopefully much faster Q4_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 65.5 ms/tok
to 41.5 ms/tok!

* k_quants: hopefully much faster Q3_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 60.3 ms/tok
to 41.0 ms/tok!

* k_quants: faster Q2_K on older GPUs

It looks like I didn't need to change anything
compared to what we already had, so this is just
adding clarifying comments. But I now measure
36.3 ms/tok on the GTX-1660, instead fo the
47.2 ms/tok that I have written in the faster
k-quants PR.

* k_quants: faster Q5_K on older GPUs

68.5 ms/tok -> 62.0 ms/tok on GTX-1660.
For some reason the same access pattern that leads
to such resounding success for Q2_K to Q4_K did not
work at all for Q5_K.

It is also more difficult to measure because for Q5_K_S
we only have 32 layers on the GTX-1660, so output, tok embeddings
and kv cache are done on the CPU.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-19 15:14:09 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ __forceinline__ float op_add(const float a, const float b) {
 								    return a + b;
 								}
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ __forceinline__ float op_mul(const float a, const float b) {
 								    return a * b;
 								}
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ __forceinline__ float op_div(const float a, const float b) {
 								    return a / b;
 								}
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
 								static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
 								        int ne0, int ne1, int ne2, int ne3,
 								        int ne10, int ne11, int ne12, int ne13,
 								        /*int s0, */ int s1,  int s2,  int s3,
 								        /*int s10,*/ int s11, int s12, int s13) {
 								    const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
 								    const int i1 = (blockDim.y*blockIdx.y + threadIdx.y);
 								    const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3;
 								    const int i3 = (blockDim.z*blockIdx.z + threadIdx.z) % ne3;
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
 								        return;
 								    }
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int i11 = i1 % ne11;
 								    const int i12 = i2 % ne12;
 								    const int i13 = i3 % ne13;
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
 								    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
 								    const size_t i_dst  = i_src0;
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const src0_t * src0_row = src0 + i_src0;
 								    const src1_t * src1_row = src1 + i_src1;
 								    dst_t * dst_row = dst + i_dst;
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) {
 								        const int i10 = i0 % ne10;
 								        dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
 								    }
 								}
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
 								static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
 								        int ne0, int ne1, int ne2, int ne3,
 								        int ne10, int ne11, int ne12, int ne13,
 								        /*int s0, */ int s1,  int s2,  int s3,
 								        /*int s10,*/ int s11, int s12, int s13) {
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int i3 = i/(ne2*ne1*ne0);
 								    const int i2 = (i/(ne1*ne0)) % ne2;
 								    const int i1 = (i/ne0) % ne1;
 								    const int i0 = i % ne0;
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
 								        return;
 								    }
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int i11 = i1 % ne11;
 								    const int i12 = i2 % ne12;
 								    const int i13 = i3 % ne13;
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
 								    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
 								    const size_t i_dst  = i_src0;
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const src0_t * src0_row = src0 + i_src0;
 								    const src1_t * src1_row = src1 + i_src1;
 								    dst_t * dst_row = dst + i_dst;
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int i10 = i0 % ne10;
 								    dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
 								}
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
 								    const int ne10, const int ne11, const int ne12,
 								    const int nb1, const int nb2, int offset) {
 								    const int i = blockDim.x * blockIdx.x + threadIdx.x;
 								    if (i >= ne) {
 								        return;
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int src1_idx = i - offset;
 								    int oz = src1_idx / nb2;
 								    int oy = (src1_idx - (oz * nb2)) / nb1;
 								    int ox = src1_idx % nb1;
 								    if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
 								        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
 								    } else {
 								        dst[i] = x[i];
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
+								    }
 								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __global__ void gelu_f32(const float * x, float * dst, const int k) {
 								    const float GELU_COEF_A    = 0.044715f;
 								    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
 								    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (i >= k) {
 								        return;
 								    }
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    float xi = x[i];
 								    dst[i] = 0.5f*xi*(1.0f + tanhf(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi)));
 								}
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __global__ void silu_f32(const float * x, float * dst, const int k) {
 								    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (i >= k) {
 								        return;
 								    }
 								    dst[i] = x[i] / (1.0f + expf(-x[i]));
 								}
-												cuda : faster k-quants on older GPUs (#1930)

* k_quants: hopefully much faster Q4_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 65.5 ms/tok
to 41.5 ms/tok!

* k_quants: hopefully much faster Q3_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 60.3 ms/tok
to 41.0 ms/tok!

* k_quants: faster Q2_K on older GPUs

It looks like I didn't need to change anything
compared to what we already had, so this is just
adding clarifying comments. But I now measure
36.3 ms/tok on the GTX-1660, instead fo the
47.2 ms/tok that I have written in the faster
k-quants PR.

* k_quants: faster Q5_K on older GPUs

68.5 ms/tok -> 62.0 ms/tok on GTX-1660.
For some reason the same access pattern that leads
to such resounding success for Q2_K to Q4_K did not
work at all for Q5_K.

It is also more difficult to measure because for Q5_K_S
we only have 32 layers on the GTX-1660, so output, tok embeddings
and kv cache are done on the CPU.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-19 15:14:09 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __global__ void gelu_quick_f32(const float * x, float * dst, int k) {
 								    const float GELU_QUICK_COEF = -1.702f;
 								    const int i  = blockDim.x*blockIdx.x + threadIdx.x;
 								    if (i >= k) {
 								        return;
 								    }
 								    dst[i] = x[i] * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x[i])));
 								}
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __global__ void tanh_f32(const float * x, float * dst, int k) {
 								    const int i  = blockDim.x*blockIdx.x + threadIdx.x;
 								    if (i >= k) {
 								        return;
 								    }
 								    dst[i] = tanhf(x[i]);
 								}
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __global__ void relu_f32(const float * x, float * dst, const int k) {
 								    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (i >= k) {
 								        return;
 								    }
 								    dst[i] = fmaxf(x[i], 0);
 								}
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __global__ void hardsigmoid_f32(const float * x, float * dst, const int k) {
 								    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-												Speed up Q4_K (#2322)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-23 05:49:20 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (i >= k) {
 								        return;
 								    }
 								    dst[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f));
 								}
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __global__ void hardswish_f32(const float * x, float * dst, const int k) {
 								    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (i >= k) {
 								        return;
 								    }
 								    dst[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f));
 								}
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __global__ void leaky_relu_f32(const float * x, float * dst, const int k, const float negative_slope) {
 								    const int i  = blockDim.x*blockIdx.x + threadIdx.x;
 								    if (i >= k) {
 								        return;
 								    }
 								    dst[i] = fmaxf(x[i], 0) + fminf(x[i], 0.0f) * negative_slope;
 								}
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __global__ void sqr_f32(const float * x, float * dst, const int k) {
 								    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (i >= k) {
 								        return;
 								    }
 								    dst[i] = x[i] * x[i];
 								}
-												Speed up Q4_K (#2322)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-23 05:49:20 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <int block_size>
 								static __global__ void norm_f32(const float * x, float * dst, const int ncols, const float eps) {
 								    const int row = blockIdx.x*blockDim.y + threadIdx.y;
 								    const int tid = threadIdx.x;
-												Speed up Q4_K (#2322)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-23 05:49:20 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    float2 mean_var = make_float2(0.f, 0.f);
-												Speed up Q4_K (#2322)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-23 05:49:20 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int col = tid; col < ncols; col += block_size) {
 								        const float xi = x[row*ncols + col];
 								        mean_var.x += xi;
 								        mean_var.y += xi * xi;
 								    }
-												Speed up Q4_K (#2322)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-23 05:49:20 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // sum up partial sums
 								    mean_var = warp_reduce_sum(mean_var);
 								    if (block_size > WARP_SIZE) {
 								        __shared__ float2 s_sum[32];
 								        int warp_id = threadIdx.x / WARP_SIZE;
 								        int lane_id = threadIdx.x % WARP_SIZE;
 								        if (lane_id == 0) {
 								            s_sum[warp_id] = mean_var;
-												Speed up Q4_K (#2322)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-23 05:49:20 +00:00
+								        }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        __syncthreads();
 								        mean_var = s_sum[lane_id];
 								        mean_var = warp_reduce_sum(mean_var);
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
+								    }
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const float mean = mean_var.x / ncols;
 								    const float var = mean_var.y / ncols - mean * mean;
 								    const float inv_std = rsqrtf(var + eps);
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int col = tid; col < ncols; col += block_size) {
 								        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								}
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __global__ void concat_f32(const float * x,const float * y, float * dst, const int ne0, const int ne02) {
 								    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
 								    if (nidx >= ne0) {
 								        return;
 								    }
 								    // operation
 								    int offset_dst =
 								        nidx +
 								        blockIdx.y * ne0 +
 								        blockIdx.z * ne0 * gridDim.y;
 								    if (blockIdx.z < ne02) { // src0
 								        int offset_src =
 								            nidx +
 								            blockIdx.y * ne0 +
 								            blockIdx.z * ne0 * gridDim.y;
 								        dst[offset_dst] = x[offset_src];
 								    } else {
 								        int offset_src =
 								            nidx +
 								            blockIdx.y * ne0 +
 								            (blockIdx.z - ne02) * ne0 *  gridDim.y;
 								        dst[offset_dst] = y[offset_src];
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
+								    }
 								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int ne00xne01, const int scale_factor) {
 								    // blockIdx.z: idx of ne02*ne03
 								    // blockIdx.y: idx of ne01*scale_factor， aka ne1
 								    // blockIDx.x: idx of ne00*scale_factor / BLOCK_SIZE
 								    // ne00xne01: ne00 * ne01
 								    int ne0 = ne00 * scale_factor;
 								    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
 								    if (nidx >= ne0) {
 								        return;
 								    }
 								    // operation
 								    int i00 = nidx / scale_factor;
 								    int i01 = blockIdx.y / scale_factor;
 								    int offset_src =
 								        i00 +
 								        i01 * ne00 +
 								        blockIdx.z * ne00xne01;
 								    int offset_dst =
 								        nidx +
 								        blockIdx.y * ne0 +
 								        blockIdx.z * ne0 * gridDim.y;
 								    dst[offset_dst] = x[offset_src];
 								}
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
 								    // blockIdx.z: idx of ne2*ne3, aka ne02*ne03
 								    // blockIdx.y: idx of ne1
 								    // blockIDx.x: idx of ne0 / BLOCK_SIZE
 								    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
 								    if (nidx >= ne0) {
 								        return;
 								    }
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // operation
 								    int offset_dst =
 								        nidx +
 								        blockIdx.y * ne0 +
 								        blockIdx.z * ne0 * gridDim.y;
 								    if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
 								        int offset_src =
 								            nidx +
 								            blockIdx.y * ne00 +
 								            blockIdx.z * ne00 * ne01;
 								        dst[offset_dst] = x[offset_src];
 								    } else {
 								        dst[offset_dst] = 0.0f;
 								    }
 								}
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
 								    // blockIDx.x: idx of ne0 / BLOCK_SIZE
 								    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
 								    if (nidx >= ne0) {
 								        return;
 								    }
 								    dst[nidx] = start + step * nidx;
 								}
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __global__ void timestep_embedding_f32(const float * timesteps, float * dst, const int nb1, const int dim, const int max_period) {
 								    // blockIDx.y: idx of timesteps->ne[0]
 								    // blockIDx.x: idx of ((dim + 1) / 2) / BLOCK_SIZE
 								    int i = blockIdx.y;
 								    int j = threadIdx.x + blockIdx.x * blockDim.x;
 								    float * embed_data = (float *)((char *)dst +  i*nb1);
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (dim % 2 != 0 && j == ((dim + 1) / 2)) {
 								        embed_data[dim] = 0.f;
 								    }
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int half = dim / 2;
 								    if (j >= half) {
 								        return;
 								    }
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    float timestep = timesteps[i];
 								    float freq = (float)expf(-logf(max_period) * j / half);
 								    float arg = timestep * freq;
 								    embed_data[j] = cosf(arg);
 								    embed_data[j + half] = sinf(arg);
 								}
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <int block_size>
 								static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
 								    // blockIdx.x: num_groups idx
 								    // threadIdx.x: block_size idx
 								    int start = blockIdx.x * group_size;
 								    int end = start + group_size;
-												Some more Q4_K and Q5_K speedup on CUDA (#2346)

* Faster Q5_K on CUDA

* Small Q5_K improvement on older GPUs

* Spped up Q4_K on CUDA

GTX1660: 29.5 ms/t -> 25.6 ms/t
RTX4080: 8.40 ms/t -> 8.25 ms/t

* Spped up Q4_K on CUDA

GTX1660: 36.7 ms/t -> 35.6 ms/t
RTX4080:  9.8 ms/t ->  9.5 ms/t

* Address PR comments

* Add some comments to satisfy PR reviewer

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-23 21:19:47 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    start += threadIdx.x;
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (end >= ne_elements) {
 								        end = ne_elements;
 								    }
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    float tmp = 0.0f; // partial sum for thread in warp
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int j = start; j < end; j += block_size) {
 								        tmp += x[j];
 								    }
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    tmp = warp_reduce_sum(tmp);
 								    if (block_size > WARP_SIZE) {
 								        __shared__ float s_sum[32];
 								        int warp_id = threadIdx.x / WARP_SIZE;
 								        int lane_id = threadIdx.x % WARP_SIZE;
 								        if (lane_id == 0) {
 								            s_sum[warp_id] = tmp;
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
+								        }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        __syncthreads();
 								        tmp = s_sum[lane_id];
 								        tmp = warp_reduce_sum(tmp);
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
+								    }
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    float mean = tmp / group_size;
 								    tmp = 0.0f;
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int j = start; j < end; j += block_size) {
 								        float xi = x[j] - mean;
 								        dst[j] = xi;
 								        tmp += xi * xi;
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
+								    }
-												cuda : replace remaining shfl_xor with calls to warp_reduce functions (#5744)


											
										
										
											2024-02-27 13:22:45 +00:00
+								    tmp = warp_reduce_sum(tmp);
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (block_size > WARP_SIZE) {
 								        __shared__ float s_sum[32];
 								        int warp_id = threadIdx.x / WARP_SIZE;
 								        int lane_id = threadIdx.x % WARP_SIZE;
 								        if (lane_id == 0) {
 								            s_sum[warp_id] = tmp;
 								        }
 								        __syncthreads();
 								        tmp = s_sum[lane_id];
 								        tmp = warp_reduce_sum(tmp);
 								    }
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    float variance = tmp / group_size;
 								    float scale = rsqrtf(variance + eps);
 								    for (int j = start; j < end; j += block_size) {
 								        dst[j] *= scale;
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
+								    }
 								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <int block_size>
 								static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
-												ggml-cuda : move row numbers to x grid dim in mmv kernels (#3921)


											
										
										
											2023-11-03 11:13:09 +00:00
+								    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int tid = threadIdx.x;
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    float tmp = 0.0f; // partial sum for thread in warp
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int col = tid; col < ncols; col += block_size) {
 								        const float xi = x[row*ncols + col];
 								        tmp += xi * xi;
 								    }
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // sum up partial sums
 								    tmp = warp_reduce_sum(tmp);
 								    if (block_size > WARP_SIZE) {
 								        __shared__ float s_sum[32];
 								        int warp_id = threadIdx.x / WARP_SIZE;
 								        int lane_id = threadIdx.x % WARP_SIZE;
 								        if (lane_id == 0) {
 								            s_sum[warp_id] = tmp;
 								        }
 								        __syncthreads();
 								        tmp = s_sum[lane_id];
 								        tmp = warp_reduce_sum(tmp);
 								    }
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const float mean = tmp / ncols;
 								    const float scale = rsqrtf(mean + eps);
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int col = tid; col < ncols; col += block_size) {
 								        dst[row*ncols + col] = scale * x[row*ncols + col];
 								    }
 								}
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
 								    const block_q4_0 * x = (const block_q4_0 *) vx;
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const dfloat d = x[ib].d;
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int vui = x[ib].qs[iqs];
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    v.x = vui & 0xF;
 								    v.y = vui >> 4;
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#ifdef GGML_CUDA_F16
 								    v = __hsub2(v, {8.0f, 8.0f});
 								    v = __hmul2(v, {d, d});
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
+								#else
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    v.x = (v.x - 8.0f) * d;
 								    v.y = (v.y - 8.0f) * d;
 								#endif // GGML_CUDA_F16
 								}
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
 								    const block_q4_1 * x = (const block_q4_1 *) vx;
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const dfloat d = __low2half(x[ib].dm);
 								    const dfloat m = __high2half(x[ib].dm);
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int vui = x[ib].qs[iqs];
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    v.x = vui & 0xF;
 								    v.y = vui >> 4;
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#ifdef GGML_CUDA_F16
 								    v = __hmul2(v, {d, d});
 								    v = __hadd2(v, {m, m});
 								#else
 								    v.x = (v.x * d) + m;
 								    v.y = (v.y * d) + m;
 								#endif // GGML_CUDA_F16
 								}
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
 								    const block_q5_0 * x = (const block_q5_0 *) vx;
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const dfloat d = x[ib].d;
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    uint32_t qh;
 								    memcpy(&qh, x[ib].qh, sizeof(qh));
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
 								    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 16:43:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
 								    v.y = ((x[ib].qs[iqs] >>  4) | xh_1);
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 17:08:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#ifdef GGML_CUDA_F16
 								    v = __hsub2(v, {16.0f, 16.0f});
 								    v = __hmul2(v, {d, d});
 								#else
 								    v.x = (v.x - 16.0f) * d;
 								    v.y = (v.y - 16.0f) * d;
 								#endif // GGML_CUDA_F16
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
 								    const block_q5_1 * x = (const block_q5_1 *) vx;
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 13:38:36 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const dfloat d = __low2half(x[ib].dm);
 								    const dfloat m = __high2half(x[ib].dm);
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 13:38:36 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    uint32_t qh;
 								    memcpy(&qh, x[ib].qh, sizeof(qh));
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
 								    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
 								    v.y = ((x[ib].qs[iqs] >>  4) | xh_1);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#ifdef GGML_CUDA_F16
 								    v = __hmul2(v, {d, d});
 								    v = __hadd2(v, {m, m});
 								#else
 								    v.x = (v.x * d) + m;
 								    v.y = (v.y * d) + m;
 								#endif // GGML_CUDA_F16
 								}
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
 								    const block_q8_0 * x = (const block_q8_0 *) vx;
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const dfloat d = x[ib].d;
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    v.x = x[ib].qs[iqs + 0];
 								    v.y = x[ib].qs[iqs + 1];
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#ifdef GGML_CUDA_F16
 								    v = __hmul2(v, {d, d});
 								#else
 								    v.x *= d;
 								    v.y *= d;
 								#endif // GGML_CUDA_F16
 								}
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<typename dst_t>
 								static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int i = blockIdx.x;
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // assume 32 threads
 								    const int tid = threadIdx.x;
 								    const int il  = tid/8;
 								    const int ir  = tid%8;
 								    const int ib = 8*i + ir;
 								    if (ib >= nb32) {
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
+								        return;
 								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    dst_t * y = yy + 256*i + 32*ir + 4*il;
 								    const block_q4_0 * x = (const block_q4_0 *)vx + ib;
 								    const float d = __half2float(x->d);
 								    const float dm = -8*d;
 								    const uint8_t * q = x->qs + 4*il;
 								    for (int l = 0; l < 4; ++l) {
 								        y[l+ 0] = d * (q[l] & 0xF) + dm;
 								        y[l+16] = d * (q[l] >>  4) + dm;
 								    }
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<typename dst_t>
 								static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int i = blockIdx.x;
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // assume 32 threads
 								    const int tid = threadIdx.x;
 								    const int il  = tid/8;
 								    const int ir  = tid%8;
 								    const int ib = 8*i + ir;
 								    if (ib >= nb32) {
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
+								        return;
 								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    dst_t * y = yy + 256*i + 32*ir + 4*il;
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const block_q4_1 * x = (const block_q4_1 *)vx + ib;
 								    const float2 d = __half22float2(x->dm);
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const uint8_t * q = x->qs + 4*il;
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int l = 0; l < 4; ++l) {
 								        y[l+ 0] = d.x * (q[l] & 0xF) + d.y;
 								        y[l+16] = d.x * (q[l] >>  4) + d.y;
 								    }
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								//================================== k-quants
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<typename dst_t>
 								static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int i   = blockIdx.x;
 								    const block_q2_K * x = (const block_q2_K *) vx;
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int tid = threadIdx.x;
 								#if QK_K == 256
 								    const int n   = tid/32;
 								    const int l   = tid - 32*n;
 								    const int is  = 8*n + l/16;
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const uint8_t q = x[i].qs[32*n + l];
 								    dst_t * y = yy + i*QK_K + 128*n;
 								    float dall = __low2half(x[i].dm);
 								    float dmin = __high2half(x[i].dm);
 								    y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
 								    y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
 								    y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
 								    y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
 								#else
 								    const int is = tid/16;  // 0 or 1
 								    const int il = tid%16;  // 0...15
 								    const uint8_t q = x[i].qs[il] >> (2*is);
 								    dst_t * y = yy + i*QK_K + 16*is + il;
 								    float dall = __low2half(x[i].dm);
 								    float dmin = __high2half(x[i].dm);
 								    y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
 								    y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
 								#endif
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<typename dst_t>
 								static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-												ggml : remove bit shuffling (#1405)

* ggml : remove Q4_0 bit shufling (ARM NEON)

* ggml : remove Q4_1 bit shuffling (ARM NEON + reference)

* ggml : nibbles_from_floats() + bytes_from_nibbles() (ARM NEON)

* ggml : remove Q4_2 bit shuffling (WIP, BROKEN)

* ggml : remove Q5_0 bit shuffling (ARM NEON)

* ggml : 2x faster scalar implementations

* ggml : remove Q5_1 bit shuffling (ARM NEON + scalar)

* ggml : simplify scalar dot

* ggml : remove WASM SIMD bit shuffling + remove vzip for ARM 32-bit

* ggml : fix Q4_1 quantization

* ggml : update cuBLAS + normalize variable names

* ggml : remove Q4_2 mode

* ggml : minor formatting

* ggml : fix Q5_0 quantization

* scripts : add script for measuring the time per token

* AVX implementations (#1370)

* ggml : uniform 5th bit extraction

* llama : produce error upon loading old model files

* llama : fix model magic/version write

* ggml : speed-up Q5_0 + Q5_1 at 4 threads

* ggml : preserve old Q4 and Q5 formats

* ggml : simplify Q8_1 - no need for low / high sums anymore

* ggml : fix Q8_0 and Q8_1 rounding

* Revert "AVX implementations (#1370)"

This reverts commit 948d124837f9d287d8490f41338e0e4cceb0814f.

* ggml : fix AVX2 implementation

* sha : update hashes for 7B and 13B

* readme : update timings + remove warning banner

* llama : update v2 PR number to 1405

* ggml : fix WASM comments

* ggml : back to original bit order

* readme : add note that Q4 and Q5 have been changed

* llama : fix return for unknown version

---------

Co-authored-by: Stephan Walter <stephan@walter.name>
											
										
										
											2023-05-11 21:23:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int i = blockIdx.x;
 								    const block_q3_K * x = (const block_q3_K *) vx;
-												ggml : add Q5_0 and Q5_1 quantization (#1187)

* ggml : add Q5_0 quantization (cuBLAS only)

* ggml : fix Q5_0 qh -> uint32_t

* ggml : fix q5_0 histogram stats

* ggml : q5_0 scalar dot product

* ggml : q5_0 ARM NEON dot

* ggml : q5_0 more efficient ARM NEON using uint64_t masks

* ggml : rename Q5_0 -> Q5_1

* ggml : adding Q5_0 mode

* quantize : add Q5_0 and Q5_1 to map

* ggml : AVX2 optimizations for Q5_0, Q5_1 (#1195)

---------

Co-authored-by: Stephan Walter <stephan@walter.name>
											
										
										
											2023-04-26 20:14:13 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if QK_K == 256
 								    const int r = threadIdx.x/4;
 								    const int tid = r/2;
 								    const int is0 = r%2;
 								    const int l0 = 16*is0 + 4*(threadIdx.x%4);
 								    const int n = tid / 4;
 								    const int j = tid - 4*n;
-												ggml : add Q8_0 quantization format (rename the old one to Q8_1) (ARM NEON) (#1179)

* ggml : add Q8_0 quantization format (rename the old one to Q8_1)

* tests : fix test-quantize-fns

* ggml : finalize Q8_0 implementation

* ggml : use q4_0_q8_0 and q4_2_q8_0

* ggml : fix Q8_0 dot product bug (ARM)

* ggml : Q8_0 unroll x2

* ggml : fix bug - using wrong block type

* ggml : extend quantize_fns_t with "vec_dot_type"

* ggml : fix Q8_0 to use 255 values out of 256

* ggml : fix assert using wrong QK4_2 instead of QK4_3
											
										
										
											2023-04-25 20:40:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    uint8_t m = 1 << (4*n + j);
 								    int is = 8*n + 2*j + is0;
 								    int shift = 2*j;
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 08:23:56 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int8_t us = is <  4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
 								                is <  8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) :
 								                is < 12 ? (x[i].scales[is-8] >>  4) | (((x[i].scales[is+0] >> 4) & 3) << 4) :
 								                          (x[i].scales[is-8] >>  4) | (((x[i].scales[is-4] >> 6) & 3) << 4);
 								    float d_all = x[i].d;
 								    float dl = d_all * (us - 32);
-												ggml : add Q8_0 quantization format (rename the old one to Q8_1) (ARM NEON) (#1179)

* ggml : add Q8_0 quantization format (rename the old one to Q8_1)

* tests : fix test-quantize-fns

* ggml : finalize Q8_0 implementation

* ggml : use q4_0_q8_0 and q4_2_q8_0

* ggml : fix Q8_0 dot product bug (ARM)

* ggml : Q8_0 unroll x2

* ggml : fix bug - using wrong block type

* ggml : extend quantize_fns_t with "vec_dot_type"

* ggml : fix Q8_0 to use 255 values out of 256

* ggml : fix assert using wrong QK4_2 instead of QK4_3
											
										
										
											2023-04-25 20:40:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    dst_t * y = yy + i*QK_K + 128*n + 32*j;
 								    const uint8_t * q = x[i].qs + 32*n;
 								    const uint8_t * hm = x[i].hmask;
-												CUDA: fixed redundant value dequantization (#4809)


											
										
										
											2024-01-07 16:24:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
 								#else
 								    const int tid = threadIdx.x;
 								    const int is  = tid/16;  // 0 or 1
 								    const int il  = tid%16;  // 0...15
 								    const int im  = il/8;    // 0...1
 								    const int in  = il%8;    // 0...7
-												CUDA: fixed redundant value dequantization (#4809)


											
										
										
											2024-01-07 16:24:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    dst_t * y = yy + i*QK_K + 16*is + il;
 								    const uint8_t q = x[i].qs[il] >> (2*is);
 								    const uint8_t h = x[i].hmask[in] >> (2*is + im);
 								    const float   d = (float)x[i].d;
 								    if (is == 0) {
 								        y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
 								        y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
 								    } else {
 								        y[ 0] = d * ((x[i].scales[0] >>  4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
 								        y[32] = d * ((x[i].scales[1] >>  4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
 								    }
 								#endif
-												CUDA: fixed redundant value dequantization (#4809)


											
										
										
											2024-01-07 16:24:08 +00:00
 								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if QK_K == 256
 								static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
 								    if (j < 4) {
 								        d = q[j] & 63; m = q[j + 4] & 63;
 								    } else {
 								        d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
 								        m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
 								    }
 								}
 								#endif
-												CUDA: faster q8_0 -> f16 dequantization (#4895)


											
										
										
											2024-01-12 19:38:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<typename dst_t>
 								static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
 								    const block_q4_K * x = (const block_q4_K *) vx;
-												CUDA: faster q8_0 -> f16 dequantization (#4895)


											
										
										
											2024-01-12 19:38:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int i = blockIdx.x;
-												CUDA: faster q8_0 -> f16 dequantization (#4895)


											
										
										
											2024-01-12 19:38:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if QK_K == 256
 								    // assume 32 threads
 								    const int tid = threadIdx.x;
 								    const int il  = tid/8;
 								    const int ir  = tid%8;
 								    const int is  = 2*il;
 								    const int n   = 4;
-												CUDA: faster q8_0 -> f16 dequantization (#4895)


											
										
										
											2024-01-12 19:38:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    dst_t * y = yy + i*QK_K + 64*il + n*ir;
-												CUDA: faster q8_0 -> f16 dequantization (#4895)


											
										
										
											2024-01-12 19:38:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const float dall = __low2half(x[i].dm);
 								    const float dmin = __high2half(x[i].dm);
-												CUDA: faster q8_0 -> f16 dequantization (#4895)


											
										
										
											2024-01-12 19:38:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const uint8_t * q = x[i].qs + 32*il + n*ir;
-												CUDA: faster q8_0 -> f16 dequantization (#4895)


											
										
										
											2024-01-12 19:38:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    uint8_t sc, m;
 								    get_scale_min_k4(is + 0, x[i].scales, sc, m);
 								    const float d1 = dall * sc; const float m1 = dmin * m;
 								    get_scale_min_k4(is + 1, x[i].scales, sc, m);
 								    const float d2 = dall * sc; const float m2 = dmin * m;
 								    for (int l = 0; l < n; ++l) {
 								        y[l + 0] = d1 * (q[l] & 0xF) - m1;
 								        y[l +32] = d2 * (q[l] >>  4) - m2;
-												CUDA: faster q8_0 -> f16 dequantization (#4895)


											
										
										
											2024-01-12 19:38:54 +00:00
+								    }
 								#else
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int tid = threadIdx.x;
 								    const uint8_t * q = x[i].qs;
 								    dst_t * y = yy + i*QK_K;
 								    const float d = (float)x[i].dm[0];
 								    const float m = (float)x[i].dm[1];
 								    y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
 								    y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >>  4) - m * (x[i].scales[1] >> 4);
 								#endif
-												CUDA: faster q8_0 -> f16 dequantization (#4895)


											
										
										
											2024-01-12 19:38:54 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<typename dst_t>
 								static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
 								    const block_q5_K * x = (const block_q5_K *) vx;
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int i = blockIdx.x;
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if QK_K == 256
 								    // assume 64 threads - this is very slightly better than the one below
 								    const int tid = threadIdx.x;
 								    const int il  = tid/16;   // il is in 0...3
 								    const int ir  = tid%16;   // ir is in 0...15
 								    const int is  = 2*il;     // is is in 0...6
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    dst_t * y = yy + i*QK_K + 64*il + 2*ir;
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const float dall = __low2half(x[i].dm);
 								    const float dmin = __high2half(x[i].dm);
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const uint8_t * ql = x[i].qs + 32*il + 2*ir;
 								    const uint8_t * qh = x[i].qh + 2*ir;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    uint8_t sc, m;
 								    get_scale_min_k4(is + 0, x[i].scales, sc, m);
 								    const float d1 = dall * sc; const float m1 = dmin * m;
 								    get_scale_min_k4(is + 1, x[i].scales, sc, m);
 								    const float d2 = dall * sc; const float m2 = dmin * m;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    uint8_t   hm  = 1 << (2*il);
 								    y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1;
 								    y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1;
 								    hm <<= 1;
 								    y[32] = d2 * ((ql[ 0] >>  4) + (qh[ 0] & hm ? 16 : 0)) - m2;
 								    y[33] = d2 * ((ql[ 1] >>  4) + (qh[ 1] & hm ? 16 : 0)) - m2;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
+								#else
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int tid = threadIdx.x;
 								    const uint8_t q = x[i].qs[tid];
 								    const int im = tid/8;  // 0...3
 								    const int in = tid%8;  // 0...7
 								    const int is = tid/16; // 0 or 1
 								    const uint8_t h = x[i].qh[in] >> im;
 								    const float d = x[i].d;
 								    dst_t * y = yy + i*QK_K + tid;
 								    y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
 								    y[32] = d * x[i].scales[is+2] * ((q >>  4) - ((h >> 4) & 1 ? 0 : 16));
 								#endif
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<typename dst_t>
 								static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
 								    const block_q6_K * x = (const block_q6_K *) vx;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int i = blockIdx.x;
 								#if QK_K == 256
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // assume 64 threads - this is very slightly better than the one below
 								    const int tid = threadIdx.x;
 								    const int ip  = tid/32;   // ip is 0 or 1
 								    const int il  = tid - 32*ip; // 0...32
 								    const int is  = 8*ip + il/16;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    dst_t * y = yy + i*QK_K + 128*ip + il;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const float d = x[i].d;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const uint8_t * ql = x[i].ql + 64*ip + il;
 								    const uint8_t   qh = x[i].qh[32*ip + il];
 								    const int8_t  * sc = x[i].scales + is;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
 								    y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
 								    y[64] = d * sc[4] * ((int8_t)((ql[ 0]  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
 								    y[96] = d * sc[6] * ((int8_t)((ql[32]  >> 4) | (((qh >> 6) & 3) << 4)) - 32);
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
+								#else
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // assume 32 threads
 								    const int tid = threadIdx.x;
 								    const int ip  = tid/16;         // 0 or 1
 								    const int il  = tid - 16*ip;    // 0...15
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    dst_t * y = yy + i*QK_K + 16*ip + il;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const float d = x[i].d;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const uint8_t   ql = x[i].ql[16*ip + il];
 								    const uint8_t   qh = x[i].qh[il] >> (2*ip);
 								    const int8_t  * sc = x[i].scales;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
 								    y[32] = d * sc[ip+2] * ((int8_t)((ql  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
 								#endif
 								}
 								inline bool ggml_cuda_supports_mmq(enum ggml_type type) {
 								    switch (type) {
 								        case GGML_TYPE_Q4_0:
 								        case GGML_TYPE_Q4_1:
 								        case GGML_TYPE_Q5_0:
 								        case GGML_TYPE_Q5_1:
 								        case GGML_TYPE_Q8_0:
 								        case GGML_TYPE_Q2_K:
 								        case GGML_TYPE_Q3_K:
 								        case GGML_TYPE_Q4_K:
 								        case GGML_TYPE_Q5_K:
 								        case GGML_TYPE_Q6_K:
 								            return true;
 								        default:
 								            return false;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								}
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<typename dst_t>
 								static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int i   = blockIdx.x;
 								    const block_iq2_xxs * x = (const block_iq2_xxs  *) vx;
 								    const int tid = threadIdx.x;
 								#if QK_K == 256
 								    const int il = tid/8; // 0...3
 								    const int ib = tid%8; // 0...7
 								    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
 								    const uint16_t * q2 = x[i].qs + 4*ib;
 								    const uint8_t  * aux8 = (const uint8_t *)q2;
 								    const uint8_t  * grid = (const uint8_t *)(iq2xxs_grid + aux8[il]);
 								    const uint32_t aux32 = q2[2] | (q2[3] << 16);
 								    const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
 								    const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
 								    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
+								#else
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    assert(false);
 								#endif
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								}
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<typename dst_t>
 								static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int i   = blockIdx.x;
 								    const block_iq2_xs * x = (const block_iq2_xs *) vx;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int tid = threadIdx.x;
 								#if QK_K == 256
 								    const int il = tid/8; // 0...3
 								    const int ib = tid%8; // 0...7
 								    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
 								    const uint16_t * q2 = x[i].qs + 4*ib;
 								    const uint8_t  * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511));
 								    const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
 								    const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
 								    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
 								#else
 								    assert(false);
 								#endif
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								}
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<typename dst_t>
 								static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int i   = blockIdx.x;
 								    const block_iq2_s * x = (const block_iq2_s *) vx;
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int tid = threadIdx.x;
 								#if QK_K == 256
 								    const int il = tid/8; // 0...3
 								    const int ib = tid%8; // 0...7
 								    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
 								    const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
 								    const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
 								    const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
 								    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
+								#else
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    assert(false);
 								#endif
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								}
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<typename dst_t>
 								static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int i   = blockIdx.x;
 								    const block_iq3_xxs * x = (const block_iq3_xxs  *) vx;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int tid = threadIdx.x;
 								#if QK_K == 256
 								    const int il = tid/8; // 0...3
 								    const int ib = tid%8; // 0...7
 								    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
 								    const uint8_t  * q3 = x[i].qs + 8*ib;
 								    const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib;
 								    const uint8_t  * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*il+0]);
 								    const uint8_t  * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*il+1]);
 								    const uint32_t aux32 = gas[0] | (gas[1] << 16);
 								    const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.5f;
 								    const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
 								    for (int j = 0; j < 4; ++j) {
 								        y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
 								        y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
+								    }
 								#else
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    assert(false);
 								#endif
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<typename dst_t>
 								static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int i   = blockIdx.x;
 								    const block_iq3_s * x = (const block_iq3_s *) vx;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int tid = threadIdx.x;
 								#if QK_K == 256
 								    const int il = tid/8; // 0...3
 								    const int ib = tid%8; // 0...7
 								    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
 								    const uint8_t * qs = x[i].qs + 8*ib;
 								    const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
 								    const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)));
 								    const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf));
 								    const uint8_t signs = x[i].signs[4*ib + il];
 								    for (int j = 0; j < 4; ++j) {
 								        y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
 								        y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
+								    }
 								#else
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    assert(false);
 								#endif
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<typename dst_t>
 								static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int i   = blockIdx.x;
 								    const block_iq1_s * x = (const block_iq1_s  *) vx;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int tid = threadIdx.x;
 								#if QK_K == 256
 								    const int il = tid/8; // 0...3
 								    const int ib = tid%8; // 0...7
 								    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
 								    const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
 								    const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1);
 								    uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
 								    grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)];
 								    grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
 								    grid32[0] &= 0x0f0f0f0f;
 								    for (int j = 0; j < 8; ++j) {
 								        y[j] = d * (q[j] + delta);
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
+								    }
 								#else
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    assert(false);
 								#endif
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<typename dst_t>
 								static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int i   = blockIdx.x;
 								    const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int tid = threadIdx.x;
 								    const int il = tid/8; // 0...3
 								    const int ib = tid%8; // 0...7
 								    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
 								    const uint8_t  * q4 = x[ib].qs + 4*il;
 								    const float d = (float)x[ib].d;
 								    for (int j = 0; j < 4; ++j) {
 								        y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
 								        y[j+16] = d * kvalues_iq4nl[q4[j] >>  4];
 								    }
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								}
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if QK_K != 64
 								template<typename dst_t>
 								static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
 								    const int i   = blockIdx.x;
 								    const block_iq4_xs * x = (const block_iq4_xs *)vx;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int tid = threadIdx.x;
 								    const int il = tid/8; // 0...3
 								    const int ib = tid%8; // 0...7
 								    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
 								    const uint8_t  * q4 = x[i].qs + 16*ib + 4*il;
 								    const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
 								    for (int j = 0; j < 4; ++j) {
 								        y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
 								        y[j+16] = d * kvalues_iq4nl[q4[j] >>  4];
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
+								    }
 								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#endif
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int row = blockIdx.x*blockDim.y + threadIdx.y;
 								    if (row > nrows) return;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int num_blocks_per_row = ncols / QK_K;
 								    const int ib0 = row*num_blocks_per_row;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const block_q2_K * x = (const block_q2_K *)vx + ib0;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    float tmp = 0; // partial sum for thread in warp
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if QK_K == 256
 								    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
 								    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int step = 16/K_QUANTS_PER_ITERATION;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
 								    const int in = tid - step*im;                        // 0...15 or 0...7
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15 or 0...14 in steps of 2
 								    const int q_offset = 32*im + l0;
 								    const int s_offset = 8*im;
 								    const int y_offset = 128*im + l0;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    uint32_t aux[4];
 								    const uint8_t * d = (const uint8_t *)aux;
 								    const uint8_t * m = (const uint8_t *)(aux + 2);
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const float   * y = yy + i * QK_K + y_offset;
 								        const uint8_t * q = x[i].qs + q_offset;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const float dall = __low2half(x[i].dm);
 								        const float dmin = __high2half(x[i].dm);
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
 								        aux[0] = a[0] & 0x0f0f0f0f;
 								        aux[1] = a[1] & 0x0f0f0f0f;
 								        aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
 								        aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
 								        float sum1 = 0, sum2 = 0;
 								        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
 								            sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
 								                  + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
 								                  + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
 								                  + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
 								                  + y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
 								                  + y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
 								                  + y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
 								                  +y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
 								            sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
 								                  + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
 								        }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        tmp += dall * sum1 - dmin * sum2;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
 								    }
 								#else
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
 								    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
 								    const int offset = tid * K_QUANTS_PER_ITERATION;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    uint32_t uaux[2];
 								    const uint8_t * d = (const uint8_t *)uaux;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const float   * y = yy + i * QK_K + offset;
 								        const uint8_t * q = x[i].qs + offset;
 								        const uint32_t * s = (const uint32_t *)x[i].scales;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        uaux[0] = s[0] & 0x0f0f0f0f;
 								        uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const float2 dall = __half22float2(x[i].dm);
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        float sum1 = 0, sum2 = 0;
 								        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
 								            const uint8_t ql = q[l];
 								            sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
 								                  + y[l+16] * d[1] * ((ql >> 2) & 3)
 								                  + y[l+32] * d[2] * ((ql >> 4) & 3)
 								                  + y[l+48] * d[3] * ((ql >> 6) & 3);
 								            sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
 								        }
 								        tmp += dall.x * sum1 - dall.y * sum2;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#endif
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // sum up partial sums and write back result
 								    tmp = warp_reduce_sum(tmp);
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (threadIdx.x == 0) {
 								        dst[row] = tmp;
 								    }
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int row = blockIdx.x*blockDim.y + threadIdx.y;
 								    if (row > nrows) return;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int num_blocks_per_row = ncols / QK_K;
 								    const int ib0 = row*num_blocks_per_row;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const block_q3_K * x = (const block_q3_K *)vx + ib0;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    float tmp = 0; // partial sum for thread in warp
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if QK_K == 256
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const uint16_t kmask1 = 0x0303;
 								    const uint16_t kmask2 = 0x0f0f;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
 								    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int n  = K_QUANTS_PER_ITERATION;               // iterations in the inner loop
 								    const int step = 16/K_QUANTS_PER_ITERATION;
 								    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
 								    const int in = tid - step*im;                        // 0....15 or 0...7
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const uint8_t m = 1 << (4*im);
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int l0 = n*in;                                 // 0...15 or 0...14 in steps of 2
 								    const int q_offset =  32*im + l0;
 								    const int y_offset = 128*im + l0;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    uint16_t utmp[4];
 								    const int8_t * s = (const int8_t *)utmp;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const uint16_t s_shift = 4*im;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const float   * y  = yy + i * QK_K + y_offset;
 								        const uint8_t * q = x[i].qs + q_offset;
 								        const uint8_t * h = x[i].hmask + l0;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const uint16_t * a = (const uint16_t *)x[i].scales;
 								        utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
 								        utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
 								        utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
 								        utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const float d = x[i].d;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        float sum = 0;
 								        for (int l = 0; l < n; ++l) {
 								            sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
 								                 + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
 								                 + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
 								                 + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
 								            sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
 								                 + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
 								                 + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
 								                + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
 								        }
 								        tmp += d * sum;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    }
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
+								#else
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
 								    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
 								    const int offset = tid * K_QUANTS_PER_ITERATION;         // 0...15 or 0...14
 								    const int in = offset/8;                                 // 0 or 1
 								    const int im = offset%8;                                 // 0...7
-												CUDA: launch_bounds, small q4_K, q5_K mmq refactor (#2596)


											
										
										
											2023-08-14 08:41:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-												CUDA: launch_bounds, small q4_K, q5_K mmq refactor (#2596)


											
										
										
											2023-08-14 08:41:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const float   * y = yy + i * QK_K + offset;
 								        const uint8_t * q = x[i].qs + offset;
 								        const uint8_t * s = x[i].scales;
-												CUDA: launch_bounds, small q4_K, q5_K mmq refactor (#2596)


											
										
										
											2023-08-14 08:41:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const float dall = (float)x[i].d;
 								        float sum = 0;
 								        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
 								            const uint8_t hl = x[i].hmask[im+l] >> in;
 								            const uint8_t ql = q[l];
 								            sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
 								                 + y[l+16] * dall * ((s[0] >>  4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
 								                 + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
 								                 + y[l+48] * dall * ((s[1] >>  4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
-												CUDA: launch_bounds, small q4_K, q5_K mmq refactor (#2596)


											
										
										
											2023-08-14 08:41:22 +00:00
+								        }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        tmp += sum;
 								    }
 								#endif
-												CUDA: launch_bounds, small q4_K, q5_K mmq refactor (#2596)


											
										
										
											2023-08-14 08:41:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // sum up partial sums and write back result
 								    tmp = warp_reduce_sum(tmp);
-												CUDA: launch_bounds, small q4_K, q5_K mmq refactor (#2596)


											
										
										
											2023-08-14 08:41:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (threadIdx.x == 0) {
 								        dst[row] = tmp;
-												CUDA: launch_bounds, small q4_K, q5_K mmq refactor (#2596)


											
										
										
											2023-08-14 08:41:22 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								}
-												CUDA: launch_bounds, small q4_K, q5_K mmq refactor (#2596)


											
										
										
											2023-08-14 08:41:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
-												CUDA: launch_bounds, small q4_K, q5_K mmq refactor (#2596)


											
										
										
											2023-08-14 08:41:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int row = blockIdx.x*blockDim.y + threadIdx.y;
 								    if (row > nrows) return;
 								    const int num_blocks_per_row = ncols / QK_K;
 								    const int ib0 = row*num_blocks_per_row;
-												CUDA: launch_bounds, small q4_K, q5_K mmq refactor (#2596)


											
										
										
											2023-08-14 08:41:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const block_q4_K * x = (const block_q4_K *)vx + ib0;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if QK_K == 256
 								    const uint16_t kmask1 = 0x3f3f;
 								    const uint16_t kmask2 = 0x0f0f;
 								    const uint16_t kmask3 = 0xc0c0;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
 								    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int step = 8/K_QUANTS_PER_ITERATION;           // 8 or 4
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int il  = tid/step;                            // 0...3
 								    const int ir  = tid - step*il;                       // 0...7 or 0...3
 								    const int n   = 2 * K_QUANTS_PER_ITERATION;          // 2 or 4
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
 								    const int in = il%2;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int l0 = n*(2*ir + in);
 								    const int q_offset = 32*im + l0;
 								    const int y_offset = 64*im + l0;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    uint16_t aux[4];
 								    const uint8_t * sc = (const uint8_t *)aux;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if K_QUANTS_PER_ITERATION == 2
 								    uint32_t q32[4];
 								    const uint8_t * q4 = (const uint8_t *)q32;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
+								#else
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    uint16_t q16[4];
 								    const uint8_t * q4 = (const uint8_t *)q16;
 								#endif
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    float tmp = 0; // partial sum for thread in warp
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const float   * y1 = yy + i*QK_K + y_offset;
 								        const float   * y2 = y1 + 128;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const float dall = __low2half(x[i].dm);
 								        const float dmin = __high2half(x[i].dm);
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const uint16_t * a = (const uint16_t *)x[i].scales;
 								        aux[0] = a[im+0] & kmask1;
 								        aux[1] = a[im+2] & kmask1;
 								        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
 								        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if K_QUANTS_PER_ITERATION == 2
 								        const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
 								        const uint32_t * q2 = q1 + 16;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        q32[0] = q1[0] & 0x0f0f0f0f;
 								        q32[1] = q1[0] & 0xf0f0f0f0;
 								        q32[2] = q2[0] & 0x0f0f0f0f;
 								        q32[3] = q2[0] & 0xf0f0f0f0;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        float4 s = {0.f, 0.f, 0.f, 0.f};
 								        float smin = 0;
 								        for (int l = 0; l < 4; ++l) {
 								            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+ 4];
 								            s.z += y2[l] * q4[l+8]; s.w += y2[l+32] * q4[l+12];
 								            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
 								        }
 								        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
+								#else
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
 								        const uint16_t * q2 = q1 + 32;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        q16[0] = q1[0] & 0x0f0f;
 								        q16[1] = q1[0] & 0xf0f0;
 								        q16[2] = q2[0] & 0x0f0f;
 								        q16[3] = q2[0] & 0xf0f0;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        float4 s = {0.f, 0.f, 0.f, 0.f};
 								        float smin = 0;
 								        for (int l = 0; l < 2; ++l) {
 								            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
 								            s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
 								            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
 								        }
 								        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
 								#endif
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#else
 								    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
 								    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int step = tid * K_QUANTS_PER_ITERATION;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    uint16_t aux16[2];
 								    const uint8_t * s = (const uint8_t *)aux16;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    float tmp = 0;
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 14:48:10 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
 								        const uint8_t * q = x[i].qs + step;
 								        const float   * y = yy + i*QK_K + step;
 								        const uint16_t * a = (const uint16_t *)x[i].scales;
 								        aux16[0] = a[0] & 0x0f0f;
 								        aux16[1] = (a[0] >> 4) & 0x0f0f;
 								        const float d = (float)x[i].dm[0];
 								        const float m = (float)x[i].dm[1];
 								        float sum = 0.f;
 								        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
 								            sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
 								                 + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
 								                 + y[j+32] * (d * s[1] * (q[j+ 0] >>  4) - m * s[3])
 								                 + y[j+48] * (d * s[1] * (q[j+16] >>  4) - m * s[3]);
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 14:48:10 +00:00
+								        }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        tmp += sum;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#endif
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // sum up partial sums and write back result
 								    tmp = warp_reduce_sum(tmp);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (tid == 0) {
 								        dst[row] = tmp;
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 07:42:34 +00:00
+								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) {
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int row = blockIdx.x;
 								    const int num_blocks_per_row = ncols / QK_K;
 								    const int ib0 = row*num_blocks_per_row;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const block_q5_K * x = (const block_q5_K *)vx + ib0;
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    float tmp = 0; // partial sum for thread in warp
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if QK_K == 256
 								    const uint16_t kmask1 = 0x3f3f;
 								    const uint16_t kmask2 = 0x0f0f;
 								    const uint16_t kmask3 = 0xc0c0;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int tid = threadIdx.x/2;  // 0...15
 								    const int ix  = threadIdx.x%2;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int il  = tid/4;     // 0...3
 								    const int ir  = tid - 4*il;// 0...3
 								    const int n   = 2;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
 								    const int in = il%2;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int l0 = n*(2*ir + in);
 								    const int q_offset = 32*im + l0;
 								    const int y_offset = 64*im + l0;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const uint8_t hm1  = 1 << (2*im);
 								    const uint8_t hm2  = hm1 << 4;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    uint16_t aux[4];
 								    const uint8_t * sc = (const uint8_t *)aux;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    uint16_t q16[8];
 								    const uint8_t * q4 = (const uint8_t *)q16;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int i = ix; i < num_blocks_per_row; i += 2) {
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const uint8_t * ql1 = x[i].qs + q_offset;
 								        const uint8_t * qh  = x[i].qh + l0;
 								        const float   * y1  = yy + i*QK_K + y_offset;
 								        const float   * y2  = y1 + 128;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const float dall = __low2half(x[i].dm);
 								        const float dmin = __high2half(x[i].dm);
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const uint16_t * a = (const uint16_t *)x[i].scales;
 								        aux[0] = a[im+0] & kmask1;
 								        aux[1] = a[im+2] & kmask1;
 								        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
 								        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 14:48:10 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        float4 sum = {0.f, 0.f, 0.f, 0.f};
 								        float smin = 0;
 								        const uint16_t * q1 = (const uint16_t *)ql1;
 								        const uint16_t * q2 = q1 + 32;
 								        q16[0] = q1[0] & 0x0f0f;
 								        q16[1] = q1[8] & 0x0f0f;
 								        q16[2] = (q1[0] >> 4) & 0x0f0f;
 								        q16[3] = (q1[8] >> 4) & 0x0f0f;
 								        q16[4] = q2[0] & 0x0f0f;
 								        q16[5] = q2[8] & 0x0f0f;
 								        q16[6] = (q2[0] >> 4) & 0x0f0f;
 								        q16[7] = (q2[8] >> 4) & 0x0f0f;
 								        for (int l = 0; l < n; ++l) {
 								            sum.x += y1[l+ 0] * (q4[l +0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
 								                   + y1[l+16] * (q4[l +2] + (qh[l+16] & (hm1 << 0) ? 16 : 0));
 								            sum.y += y1[l+32] * (q4[l +4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
 								                   + y1[l+48] * (q4[l +6] + (qh[l+16] & (hm1 << 1) ? 16 : 0));
 								            sum.z += y2[l+ 0] * (q4[l +8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
 								                   + y2[l+16] * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0));
 								            sum.w += y2[l+32] * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
 								                   + y2[l+48] * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0));
 								            smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
 								                  + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 14:48:10 +00:00
+								        }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#else
 								    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
 								    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
 								    const int step = tid * K_QUANTS_PER_ITERATION;
 								    const int im = step/8;
 								    const int in = step%8;
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 14:48:10 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
 								        const uint8_t * q = x[i].qs + step;
 								        const int8_t  * s = x[i].scales;
 								        const float   * y = yy + i*QK_K + step;
 								        const float     d = x[i].d;
 								        float sum = 0.f;
 								        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
 								            const uint8_t h = x[i].qh[in+j] >> im;
 								            sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
 								                 + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
 								                 + y[j+32] * d * s[2] * ((q[j+ 0] >>  4) - ((h >> 4) & 1 ? 0 : 16))
 								                 + y[j+48] * d * s[3] * ((q[j+16] >>  4) - ((h >> 6) & 1 ? 0 : 16));
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 14:48:10 +00:00
+								        }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        tmp += sum;
 								    }
 								#endif
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // sum up partial sums and write back result
 								    tmp = warp_reduce_sum(tmp);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (threadIdx.x == 0) {
 								        dst[row] = tmp;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
+								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int row = blockIdx.x*blockDim.y + threadIdx.y;
 								    if (row > nrows) return;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int num_blocks_per_row = ncols / QK_K;
 								    const int ib0 = row*num_blocks_per_row;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const block_q6_K * x = (const block_q6_K *)vx + ib0;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if QK_K == 256
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
 								    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0, 1
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int step = 16/K_QUANTS_PER_ITERATION;          // 16 or 8
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
 								    const int in = tid - step*im;                        // 0...15 or 0...7
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if K_QUANTS_PER_ITERATION == 1
 								    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15
 								    const int is = 0;
 								#else
 								    const int l0 = 4 * in;                               // 0, 4, 8, ..., 28
 								    const int is = in / 4;
 								#endif
 								    const int ql_offset = 64*im + l0;
 								    const int qh_offset = 32*im + l0;
 								    const int s_offset  =  8*im + is;
 								    const int y_offset = 128*im + l0;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    float tmp = 0; // partial sum for thread in warp
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const float   * y  = yy + i * QK_K + y_offset;
 								        const uint8_t * ql = x[i].ql + ql_offset;
 								        const uint8_t * qh = x[i].qh + qh_offset;
 								        const int8_t  * s  = x[i].scales + s_offset;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const float d = x[i].d;
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 14:48:10 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if K_QUANTS_PER_ITERATION == 1
 								        float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
 								                  + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
 								                  + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
 								                  + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
 								                  + y[64] * s[4] * d * ((int8_t)((ql[ 0]  >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
 								                  + y[80] * s[5] * d * ((int8_t)((ql[16]  >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
 								                  + y[96] * s[6] * d * ((int8_t)((ql[32]  >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
 								                  +y[112] * s[7] * d * ((int8_t)((ql[48]  >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
 								        tmp += sum;
 								#else
 								        float sum = 0;
 								        for (int l = 0; l < 4; ++l) {
 								            sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
 								                 + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
 								                 + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
 								                 + y[l+96] * s[6] * d * ((int8_t)((ql[l+32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 14:48:10 +00:00
+								        }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        tmp += sum;
 								#endif
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    }
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#else
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...7
 								    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0...3
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int step = tid * K_QUANTS_PER_ITERATION;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    float tmp = 0; // partial sum for thread in warp
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const float   * y  = yy + i * QK_K + step;
 								        const uint8_t * ql = x[i].ql + step;
 								        const uint8_t * qh = x[i].qh + step;
 								        const int8_t  * s  = x[i].scales;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const float d = x[i+0].d;
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 14:48:10 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        float sum = 0;
 								        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
 								            sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
 								                 + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
 								                 + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >>  4) | ((qh[j] & 0x30) >> 0)) - 32)
 								                 + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >>  4) | ((qh[j] & 0xc0) >> 2)) - 32);
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 14:48:10 +00:00
+								        }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        tmp += sum;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
 								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#endif
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // sum up partial sums and write back result
 								    tmp = warp_reduce_sum(tmp);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (tid == 0) {
 								        dst[row] = tmp;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
+								    }
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){
 								    const half * x = (const half *) vx;
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // automatic half -> float type cast if dfloat == float
 								    v.x = x[ib + iqs + 0];
 								    v.y = x[ib + iqs + 1];
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
 								    const int ix = blockDim.x*blockIdx.x + threadIdx.x;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (ix >= kx_padded) {
 								        return;
 								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int iy = blockDim.y*blockIdx.y + threadIdx.y;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int i_padded = iy*kx_padded + ix;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    block_q8_1 * y = (block_q8_1 *) vy;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int ib = i_padded / QK8_1; // block index
 								    const int iqs = i_padded % QK8_1; // quant index
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
 								    float amax = fabsf(xi);
 								    float sum = xi;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    amax = warp_reduce_max(amax);
 								    sum = warp_reduce_sum(sum);
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 14:48:10 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const float d = amax / 127;
 								    const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    y[ib].qs[iqs] = q;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (iqs > 0) {
 								        return;
 								    }
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    reinterpret_cast<half&>(y[ib].ds.x) = d;
 								    reinterpret_cast<half&>(y[ib].ds.y) = sum;
 								}
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 								static __global__ void k_get_rows(
 								            const void * src0, const int32_t * src1, dst_t * dst,
 								            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
 								            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
 								            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
 								            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
 								            size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2;
 								    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
 								    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
 								    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (i00 >= ne00) {
 								        return;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
 								    const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 14:48:10 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int ib = i00/qk; // block index
 								    const int iqs = (i00%qk)/qr; // quant index
 								    const int iybs = i00 - i00%qk; // dst block start index
 								    const int y_offset = qr == 1 ? 1 : qk/2;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // dequantize
 								    dfloat2 v;
 								    dequantize_kernel(src0_row, ib, iqs, v);
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    dst_row[iybs + iqs + 0]        = v.x;
 								    dst_row[iybs + iqs + y_offset] = v.y;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<typename src0_t, typename dst_t>
 								static __global__ void k_get_rows_float(
 								            const src0_t * src0, const int32_t * src1, dst_t * dst,
 								            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
 								            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
 								            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
 								            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
 								            size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int i00 = blockIdx.x*blockDim.x + threadIdx.x;
 								    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
 								    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
 								    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (i00 >= ne00) {
 								        return;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
+								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
 								    const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    dst_row[i00] = src0_row[i00];
 								}
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 								static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
 								    const int i = 2*(blockDim.x*blockIdx.x + threadIdx.x);
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (i >= k) {
 								        return;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 16:04:04 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int ib = i/qk; // block index
 								    const int iqs = (i%qk)/qr; // quant index
 								    const int iybs = i - i%qk; // y block start index
 								    const int y_offset = qr == 1 ? 1 : qk/2;
 								    // dequantize
 								    dfloat2 v;
 								    dequantize_kernel(vx, ib, iqs, v);
 								    y[iybs + iqs + 0]        = v.x;
 								    y[iybs + iqs + y_offset] = v.y;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								}
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <typename src_t, typename dst_t>
 								static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
 								    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (i >= k) {
 								        return;
 								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const src_t * x = (src_t *) vx;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    y[i] = x[i];
 								}
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <bool need_check>
 								static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int k) {
 								#if __CUDA_ARCH__ >= CC_PASCAL
 								    constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int   i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;
 								    const int * x0 = ((int *) vx) + blockIdx.x * nint;
 								    half2 * y2 = (half2 *) (y + i0);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    __shared__ int vals[nint];
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
 								#pragma unroll
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int ix0 = 0; ix0 < nint; ix0 += WARP_SIZE) {
 								        if (need_check && i0*sizeof(block_q8_0)/QK8_0 + sizeof(int)*(ix0 + threadIdx.x) >= k*sizeof(block_q8_0)/QK8_0) {
 								            break;
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 14:48:10 +00:00
+								        }
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int ix = ix0 + threadIdx.x;
 								        vals[ix] = x0[ix];
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
+								    }
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 07:42:34 +00:00
+								#pragma unroll
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int iy = 0; iy < CUDA_Q8_0_NE_ALIGN; iy += 2*WARP_SIZE) {
 								        if (need_check && i0 + iy + 2*threadIdx.x >= k) {
 								            return;
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 07:42:34 +00:00
+								        }
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const half * b0 = ((const half  *) vals) + (sizeof(block_q8_0)/sizeof(half)) * ((iy + 2*threadIdx.x)/QK8_0);
 								        const half    d = *b0;
 								        const char2  qs = ((const char2 *) (b0 + 1))[threadIdx.x % (QK8_0/2)];
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        y2[iy/2 + threadIdx.x] = __hmul2(make_half2(qs.x, qs.y), __half2half2(d));
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 07:42:34 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#else
 								    GGML_UNUSED(vx);
 								    GGML_UNUSED(y);
 								    GGML_UNUSED(k);
 								    NO_DEVICE_CODE;
 								#endif // __CUDA_ARCH__ >= CC_PASCAL
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
 								// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 17:44:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#define VDR_Q4_0_Q8_1_MMVQ 2
 								#define VDR_Q4_0_Q8_1_MMQ  4
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 17:44:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(
 								    const int * v, const int * u, const float & d4, const half2 & ds8) {
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 17:44:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    int sumi = 0;
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 17:44:08 +00:00
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
+								#pragma unroll
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int i = 0; i < vdr; ++i) {
 								        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
 								        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 17:44:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        // SIMD dot product of quantized values
 								        sumi = __dp4a(vi0, u[2*i+0], sumi);
 								        sumi = __dp4a(vi1, u[2*i+1], sumi);
 								    }
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 17:44:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const float2 ds8f = __half22float2(ds8);
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 17:44:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // second part effectively subtracts 8 from each quant value
 								    return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
 								#else
 								    NO_DEVICE_CODE;
 								#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								}
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 17:44:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#define VDR_Q4_1_Q8_1_MMVQ 2
 								#define VDR_Q4_1_Q8_1_MMQ  4
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(
 								    const int * v, const int * u, const half2 & dm4, const half2 & ds8) {
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    int sumi = 0;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
+								#pragma unroll
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int i = 0; i < vdr; ++i) {
 								        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
 								        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        // SIMD dot product of quantized values
 								        sumi = __dp4a(vi0, u[2*i+0], sumi);
 								        sumi = __dp4a(vi1, u[2*i+1], sumi);
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#ifdef GGML_CUDA_F16
 								    const float2 tmp = __half22float2(__hmul2(dm4, ds8));
 								    const float d4d8 = tmp.x;
 								    const float m4s8 = tmp.y;
 								#else
 								    const float2 dm4f = __half22float2(dm4);
 								    const float2 ds8f = __half22float2(ds8);
 								    const float d4d8 = dm4f.x * ds8f.x;
 								    const float m4s8 = dm4f.y * ds8f.y;
 								#endif // GGML_CUDA_F16
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
 								    return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
 								#else
 								    NO_DEVICE_CODE;
 								#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 								}
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 14:48:10 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#define VDR_Q5_0_Q8_1_MMVQ 2
 								#define VDR_Q5_0_Q8_1_MMQ  4
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
 								    const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) {
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    int sumi = 0;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
 								#pragma unroll
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int i = 0; i < vdr; ++i) {
 								        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
 								        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
 								        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
 								        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
 								        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
 								        sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 14:48:10 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
 								        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
 								        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
 								        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
 								        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
 								        sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
 								    }
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const float2 ds8f = __half22float2(ds8);
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // second part effectively subtracts 16 from each quant value
 								    return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
 								#else
 								    NO_DEVICE_CODE;
 								#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 17:44:08 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#define VDR_Q5_1_Q8_1_MMVQ 2
 								#define VDR_Q5_1_Q8_1_MMQ  4
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 17:44:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(
 								    const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) {
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    int sumi = 0;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
+								#pragma unroll
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int i = 0; i < vdr; ++i) {
 								        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
 								        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
 								        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
 								        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
 								        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
 								        sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
 								        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
 								        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
 								        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
 								        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
 								        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
 								        sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								    }
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 17:44:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#ifdef GGML_CUDA_F16
 								    const float2 tmp = __half22float2(__hmul2(dm5, ds8));
 								    const float d5d8 = tmp.x;
 								    const float m5s8 = tmp.y;
 								#else
 								    const float2 dm5f = __half22float2(dm5);
 								    const float2 ds8f = __half22float2(ds8);
 								    const float d5d8 = dm5f.x * ds8f.x;
 								    const float m5s8 = dm5f.y * ds8f.y;
 								#endif // GGML_CUDA_F16
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 17:44:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
 								    return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
 								#else
 								    NO_DEVICE_CODE;
 								#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 17:44:08 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#define VDR_Q8_0_Q8_1_MMVQ 2
 								#define VDR_Q8_0_Q8_1_MMQ 8
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 17:44:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl(
 								    const int * v, const int * u, const float & d8_0, const float & d8_1) {
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    int sumi = 0;
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 17:44:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#pragma unroll
 								    for (int i = 0; i < vdr; ++i) {
 								        // SIMD dot product of quantized values
 								        sumi = __dp4a(v[i], u[i], sumi);
 								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    return d8_0*d8_1 * sumi;
 								#else
 								    NO_DEVICE_CODE;
 								#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 								}
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl(
 								    const int * v, const int * u, const half2 & dm8, const half2 & ds8) {
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    int sumi = 0;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
+								#pragma unroll
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int i = 0; i < vdr; ++i) {
 								        // SIMD dot product of quantized values
 								        sumi = __dp4a(v[i], u[i], sumi);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#ifdef GGML_CUDA_F16
 								    const float2 tmp = __half22float2(__hmul2(dm8, ds8));
 								    const float d8d8 = tmp.x;
 								    const float m8s8 = tmp.y;
 								#else
 								    const float2 dm8f = __half22float2(dm8);
 								    const float2 ds8f = __half22float2(ds8);
 								    const float d8d8 = dm8f.x * ds8f.x;
 								    const float m8s8 = dm8f.y * ds8f.y;
 								#endif // GGML_CUDA_F16
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
 								    return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
 								#else
 								    NO_DEVICE_CODE;
 								#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#define VDR_Q2_K_Q8_1_MMVQ 1
 								#define VDR_Q2_K_Q8_1_MMQ  2
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								// contiguous v/x values
 								static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
 								    const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
 								    const half2 & dm2, const float * __restrict__ d8) {
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    float sumf_d = 0.0f;
 								    float sumf_m = 0.0f;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
+								#pragma unroll
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int i = 0; i < QR2_K; ++i) {
 								        const int sc = scales[2*i];
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 14:48:10 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int vi = (v >> (2*i)) & 0x03030303;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        // fill int with 4x m
 								        int m = sc >> 4;
 								        m |= m <<  8;
 								        m |= m << 16;
 								        sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const float2 dm2f = __half22float2(dm2);
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 14:48:10 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    return dm2f.x*sumf_d - dm2f.y*sumf_m;
 								#else
 								    NO_DEVICE_CODE;
 								#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 								}
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								// contiguous u/y values
 								static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
 								    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
 								    const half2 & dm2, const float & d8) {
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    int sumi_d = 0;
 								    int sumi_m = 0;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
 								#pragma unroll
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) {
 								        int sumi_d_sc = 0;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int sc = scales[i0 / (QI8_1/2)];
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        // fill int with 4x m
 								        int m = sc >> 4;
 								        m |= m <<  8;
 								        m |= m << 16;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
 								#pragma unroll
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        for (int i = i0; i < i0 + QI8_1/2; ++i) {
 								            sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
 								            sumi_m    = __dp4a(m,    u[i], sumi_m); // multiply sum of q8_1 values with m
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 14:48:10 +00:00
+								        }
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        sumi_d += sumi_d_sc * (sc & 0xF);
 								    }
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const float2 dm2f = __half22float2(dm2);
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
 								#else
 								    NO_DEVICE_CODE;
 								#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 								}
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#define VDR_Q3_K_Q8_1_MMVQ 1
 								#define VDR_Q3_K_Q8_1_MMQ  2
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								// contiguous v/x values
 								static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
 								    const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
 								    const int & scale_offset, const float & d3, const float * __restrict__ d8) {
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    float sumf = 0.0f;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#pragma unroll
 								    for (int i = 0; i < QR3_K; ++i) {
 								        const int isc = scale_offset + 2*i;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int isc_low = isc % (QK_K/32);
 								        const int sc_shift_low = 4 * (isc / (QK_K/32));
 								        const int sc_low  = (scales[isc_low] >> sc_shift_low) & 0xF;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int isc_high = isc % (QK_K/64);
 								        const int sc_shift_high = 2 * (isc / (QK_K/64));
 								        const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int sc = (sc_low | sc_high) - 32;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int vil = (vl >> (2*i)) & 0x03030303;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int vih = ((vh >> i) << 2) & 0x04040404;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int vi = __vsubss4(vil, vih);
 								        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    return d3 * sumf;
 								#else
 								    NO_DEVICE_CODE;
 								#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								// contiguous u/y values
 								static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
 								    const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales,
 								    const float & d3, const float & d8) {
-												Fix Q4_K and Q5_K for QK_K = 64 on CUDA (#2359)

* Fix Q4_K and Q5_K for QK_K = 64

* Very slightly better Q5_K bit fiddling

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-25 10:48:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    int sumi = 0;
-												Fix Q4_K and Q5_K for QK_K = 64 on CUDA (#2359)

* Fix Q4_K and Q5_K for QK_K = 64

* Very slightly better Q5_K bit fiddling

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-25 10:48:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#pragma unroll
 								    for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) {
 								        int sumi_sc = 0;
-												Some more Q4_K and Q5_K speedup on CUDA (#2346)

* Faster Q5_K on CUDA

* Small Q5_K improvement on older GPUs

* Spped up Q4_K on CUDA

GTX1660: 29.5 ms/t -> 25.6 ms/t
RTX4080: 8.40 ms/t -> 8.25 ms/t

* Spped up Q4_K on CUDA

GTX1660: 36.7 ms/t -> 35.6 ms/t
RTX4080:  9.8 ms/t ->  9.5 ms/t

* Address PR comments

* Add some comments to satisfy PR reviewer

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-23 21:19:47 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        for (int i = i0; i < i0 + QI8_1/2; ++i) {
 								            sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product
 								        }
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 17:44:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        sumi += sumi_sc * scales[i0 / (QI8_1/2)];
-												Speed up Q4_K (#2322)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-23 05:49:20 +00:00
+								    }
-												Some more Q4_K and Q5_K speedup on CUDA (#2346)

* Faster Q5_K on CUDA

* Small Q5_K improvement on older GPUs

* Spped up Q4_K on CUDA

GTX1660: 29.5 ms/t -> 25.6 ms/t
RTX4080: 8.40 ms/t -> 8.25 ms/t

* Spped up Q4_K on CUDA

GTX1660: 36.7 ms/t -> 35.6 ms/t
RTX4080:  9.8 ms/t ->  9.5 ms/t

* Address PR comments

* Add some comments to satisfy PR reviewer

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-23 21:19:47 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    return d3*d8 * sumi;
 								#else
 								    NO_DEVICE_CODE;
 								#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 								}
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 17:44:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#define VDR_Q4_K_Q8_1_MMVQ 2
 								#define VDR_Q4_K_Q8_1_MMQ  8
-												Fix Q4_K and Q5_K for QK_K = 64 on CUDA (#2359)

* Fix Q4_K and Q5_K for QK_K = 64

* Very slightly better Q5_K bit fiddling

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-25 10:48:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								// contiguous v/x values
 								static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
 								    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
 								    const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
-												Fix Q4_K and Q5_K for QK_K = 64 on CUDA (#2359)

* Fix Q4_K and Q5_K for QK_K = 64

* Very slightly better Q5_K bit fiddling

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-25 10:48:04 +00:00
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    float sumf_d = 0.0f;
 								    float sumf_m = 0.0f;
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#pragma unroll
 								    for (int i = 0; i < QR4_K; ++i) {
 								        const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
 								        const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
 								        const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        sumf_d += d8[i] * (dot1 * sc[i]);
 								        sumf_m += d8[i] * (dot2 * m[i]);  // multiply constant part of q4_K with sum of q8_1 values
 								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const float2 dm4f = __half22float2(dm4);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    return dm4f.x*sumf_d - dm4f.y*sumf_m;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
 								#else
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 12:31:56 +00:00
+								    NO_DEVICE_CODE;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								// contiguous u/y values
 								static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
 								    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
 								    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    float sumf_d = 0.0f;
 								    float sumf_m = 0.0f;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
 								#pragma unroll
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
 								        int sumi_d = 0;
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 14:48:10 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#pragma unroll
 								        for (int j = 0; j < QI8_1; ++j) {
 								            sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 14:48:10 +00:00
+								        }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const float2 ds8f = __half22float2(ds8[i]);
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        sumf_d += ds8f.x * (sc[i] * sumi_d);
 								        sumf_m += ds8f.y *   m[i]; // sum of q8_1 block * q4_K min val
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const float2 dm4f = __half22float2(dm4);
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    return dm4f.x*sumf_d - dm4f.y*sumf_m;
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 14:48:10 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#else
 								    NO_DEVICE_CODE;
 								#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 								}
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#define VDR_Q5_K_Q8_1_MMVQ 2
 								#define VDR_Q5_K_Q8_1_MMQ  8
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								// contiguous v/x values
 								static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
 								    const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
 								    const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    float sumf_d = 0.0f;
 								    float sumf_m = 0.0f;
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 14:48:10 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#pragma unroll
 								    for (int i = 0; i < QR5_K; ++i) {
 								        const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
 								        const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
 								        const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int v0i = vl0i | vh0i;
 								        const int v1i = vl1i | vh1i;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
 								        const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        sumf_d += d8[i] * (dot1 * sc[i]);
 								        sumf_m += d8[i] * (dot2 * m[i]);
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
+								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const float2 dm5f = __half22float2(dm5);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    return dm5f.x*sumf_d - dm5f.y*sumf_m;
-												Fix Q4_K and Q5_K for QK_K = 64 on CUDA (#2359)

* Fix Q4_K and Q5_K for QK_K = 64

* Very slightly better Q5_K bit fiddling

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-25 10:48:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#else
 								    NO_DEVICE_CODE;
 								#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 17:44:08 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								// contiguous u/y values
 								static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
 								    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
 								    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 17:44:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    float sumf_d = 0.0f;
 								    float sumf_m = 0.0f;
-												Fix Q4_K and Q5_K for QK_K = 64 on CUDA (#2359)

* Fix Q4_K and Q5_K for QK_K = 64

* Very slightly better Q5_K bit fiddling

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-25 10:48:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#pragma unroll
 								    for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
 								        int sumi_d = 0;
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 17:44:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#pragma unroll
 								        for (int j = 0; j < QI8_1; ++j) {
 								            sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
 								        }
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 17:44:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const float2 ds8f = __half22float2(ds8[i]);
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 17:44:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        sumf_d += ds8f.x * (sc[i] * sumi_d);
 								        sumf_m += ds8f.y *   m[i]; // sum of q8_1 block * q4_K min val
-												Some more Q4_K and Q5_K speedup on CUDA (#2346)

* Faster Q5_K on CUDA

* Small Q5_K improvement on older GPUs

* Spped up Q4_K on CUDA

GTX1660: 29.5 ms/t -> 25.6 ms/t
RTX4080: 8.40 ms/t -> 8.25 ms/t

* Spped up Q4_K on CUDA

GTX1660: 36.7 ms/t -> 35.6 ms/t
RTX4080:  9.8 ms/t ->  9.5 ms/t

* Address PR comments

* Add some comments to satisfy PR reviewer

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-23 21:19:47 +00:00
+								    }
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 17:44:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const float2 dm4f = __half22float2(dm4);
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 17:44:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    return dm4f.x*sumf_d - dm4f.y*sumf_m;
-												Fix Q4_K and Q5_K for QK_K = 64 on CUDA (#2359)

* Fix Q4_K and Q5_K for QK_K = 64

* Very slightly better Q5_K bit fiddling

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-25 10:48:04 +00:00
 								#else
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    NO_DEVICE_CODE;
 								#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 								}
 								#define VDR_Q6_K_Q8_1_MMVQ 1
 								#define VDR_Q6_K_Q8_1_MMQ  8
 								// contiguous v/x values
 								static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
 								    const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
 								    const float & d, const float * __restrict__ d8) {
-												Fix Q4_K and Q5_K for QK_K = 64 on CUDA (#2359)

* Fix Q4_K and Q5_K for QK_K = 64

* Very slightly better Q5_K bit fiddling

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-25 10:48:04 +00:00
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    float sumf = 0.0f;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#pragma unroll
 								    for (int i = 0; i < QR6_K; ++i) {
 								        const int sc = scales[4*i];
-												Fix Q4_K and Q5_K for QK_K = 64 on CUDA (#2359)

* Fix Q4_K and Q5_K for QK_K = 64

* Very slightly better Q5_K bit fiddling

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-25 10:48:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
-												Fix Q4_K and Q5_K for QK_K = 64 on CUDA (#2359)

* Fix Q4_K and Q5_K for QK_K = 64

* Very slightly better Q5_K bit fiddling

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-25 10:48:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
-												Fix Q4_K and Q5_K for QK_K = 64 on CUDA (#2359)

* Fix Q4_K and Q5_K for QK_K = 64

* Very slightly better Q5_K bit fiddling

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-25 10:48:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
-												Fix Q4_K and Q5_K for QK_K = 64 on CUDA (#2359)

* Fix Q4_K and Q5_K for QK_K = 64

* Very slightly better Q5_K bit fiddling

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-25 10:48:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
 								    }
-												Fix Q4_K and Q5_K for QK_K = 64 on CUDA (#2359)

* Fix Q4_K and Q5_K for QK_K = 64

* Very slightly better Q5_K bit fiddling

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-25 10:48:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    return d*sumf;
 								#else
 								    NO_DEVICE_CODE;
 								#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 								}
-												Fix Q4_K and Q5_K for QK_K = 64 on CUDA (#2359)

* Fix Q4_K and Q5_K for QK_K = 64

* Very slightly better Q5_K bit fiddling

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-25 10:48:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								// contiguous u/y values
 								static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
 								    const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc,
 								    const float & d6, const float * __restrict__ d8) {
-												Fix Q4_K and Q5_K for QK_K = 64 on CUDA (#2359)

* Fix Q4_K and Q5_K for QK_K = 64

* Very slightly better Q5_K bit fiddling

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-25 10:48:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    float sumf_d = 0.0f;
-												Fix Q4_K and Q5_K for QK_K = 64 on CUDA (#2359)

* Fix Q4_K and Q5_K for QK_K = 64

* Very slightly better Q5_K bit fiddling

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-25 10:48:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#pragma unroll
 								    for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) {
 								        int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale
 								#pragma unroll
 								        for (int i = i0; i < i0 + 2; ++i) {
 								            sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product
 								            sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product
 								            sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product
 								            sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product
 								        }
 								        sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y);
 								    }
 								    return d6 * sumf_d;
-												Fix Q4_K and Q5_K for QK_K = 64 on CUDA (#2359)

* Fix Q4_K and Q5_K for QK_K = 64

* Very slightly better Q5_K bit fiddling

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-25 10:48:04 +00:00
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								#else
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 12:31:56 +00:00
+								    NO_DEVICE_CODE;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								}
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
 								    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 								    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
 								    int v[VDR_Q4_0_Q8_1_MMVQ];
 								    int u[2*VDR_Q4_0_Q8_1_MMVQ];
 								#pragma unroll
 								    for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
 								        v[i]     = get_int_from_uint8(bq4_0->qs, iqs + i);
 								        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
 								        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
 								    }
 								    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
 								    GGML_UNUSED(x_qh);
 								    GGML_UNUSED(x_sc);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    __shared__ int  tile_x_qs[mmq_y * (WARP_SIZE)       + mmq_y];
 								    __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    *x_ql = tile_x_qs;
 								    *x_dm = (half2 *) tile_x_d;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 14:48:10 +00:00
+								    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
-												Enable build with CUDA 11.0 (make) (#3132)

* CUDA 11.0 fixes

* Cleaner CUDA/host flags separation

Also renamed GGML_ASSUME into GGML_CUDA_ASSUME
											
										
										
											2023-09-16 14:55:43 +00:00
+								    GGML_CUDA_ASSUME(i_offset >= 0);
 								    GGML_CUDA_ASSUME(i_offset <  nwarps);
 								    GGML_CUDA_ASSUME(k >= 0);
 								    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int kbx  = k / QI4_0;
 								    const int kqsx = k % QI4_0;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const block_q4_0 * bx0 = (const block_q4_0 *) vx;
 								    float * x_dmf = (float *) x_dm;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
 								#pragma unroll
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 07:42:34 +00:00
+								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 14:48:10 +00:00
+								        int i = i0 + i_offset;
 								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
 								        // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
 								    const int kbxd = k % blocks_per_tile_x_row;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
 								#pragma unroll
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
 								        int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 14:48:10 +00:00
 								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
+								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
 								    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
 								    const float * x_dmf = (const float *) x_dm;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int u[2*VDR_Q4_0_Q8_1_MMQ];
 								#pragma unroll
 								    for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
 								        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
 								        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
 								    }
 								    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
 								        (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
 								         y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 17:44:08 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 17:44:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 17:44:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int v[VDR_Q4_1_Q8_1_MMVQ];
 								    int u[2*VDR_Q4_1_Q8_1_MMVQ];
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 17:44:08 +00:00
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
+								#pragma unroll
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
 								        v[i]    = get_int_from_uint8_aligned(bq4_1->qs, iqs + i);
 								        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
 								        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								    }
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 17:44:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								}
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 17:44:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
 								    GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 17:44:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    __shared__ int   tile_x_qs[mmq_y * (WARP_SIZE) +     + mmq_y];
 								    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 17:44:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    *x_ql = tile_x_qs;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								    *x_dm = tile_x_dm;
 								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 14:48:10 +00:00
+								    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												Enable build with CUDA 11.0 (make) (#3132)

* CUDA 11.0 fixes

* Cleaner CUDA/host flags separation

Also renamed GGML_ASSUME into GGML_CUDA_ASSUME
											
										
										
											2023-09-16 14:55:43 +00:00
+								    GGML_CUDA_ASSUME(i_offset >= 0);
 								    GGML_CUDA_ASSUME(i_offset <  nwarps);
 								    GGML_CUDA_ASSUME(k >= 0);
 								    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int kbx  = k / QI4_1;
 								    const int kqsx = k % QI4_1;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const block_q4_1 * bx0 = (const block_q4_1 *) vx;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
 								#pragma unroll
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 07:42:34 +00:00
+								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 14:48:10 +00:00
+								        int i = i0 + i_offset;
 								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
 								    }
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int blocks_per_tile_x_row = WARP_SIZE / QI4_1;
 								    const int kbxd = k % blocks_per_tile_x_row;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#pragma unroll
 								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
 								        int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        if (need_check) {
 								            i = min(i, i_max);
 								        }
 								        const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd;
 								        x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								}
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
 								    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
 								    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
 								    GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
 								    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
 								    int u[2*VDR_Q4_1_Q8_1_MMQ];
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
 								#pragma unroll
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
 								        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
 								        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
 								    }
 								    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
 								        (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
 								         y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
 								}
 								static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
 								    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 								    const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
 								    int vl[VDR_Q5_0_Q8_1_MMVQ];
 								    int vh[VDR_Q5_0_Q8_1_MMVQ];
 								    int  u[2*VDR_Q5_0_Q8_1_MMVQ];
 								#pragma unroll
 								    for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
 								        vl[i]    = get_int_from_uint8(bq5_0->qs, iqs + i);
 								        vh[i]    = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i));
 								        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
 								        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0);
 								    }
 								    return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
 								}
 								template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
 								    GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
 								    __shared__ int  tile_x_ql[mmq_y * (2*WARP_SIZE)     + mmq_y];
 								    __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
 								    *x_ql = tile_x_ql;
 								    *x_dm = (half2 *) tile_x_d;
 								}
 								template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
 								    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
 								    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 								    GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
 								    GGML_CUDA_ASSUME(i_offset >= 0);
 								    GGML_CUDA_ASSUME(i_offset <  nwarps);
 								    GGML_CUDA_ASSUME(k >= 0);
 								    GGML_CUDA_ASSUME(k <  WARP_SIZE);
 								    const int kbx  = k / QI5_0;
 								    const int kqsx = k % QI5_0;
 								    const block_q5_0 * bx0 = (const block_q5_0 *) vx;
 								#pragma unroll
 								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
 								        int i = i0 + i_offset;
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 14:48:10 +00:00
 								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int ql = get_int_from_uint8(bxi->qs, kqsx);
 								        const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0));
 								        int qs0 = (ql >>  0)   & 0x0F0F0F0F;
 								        qs0    |= (qh <<  4)   & 0x00000010;  // 0 ->  4
 								        qs0    |= (qh << 11)   & 0x00001000;  // 1 -> 12
 								        qs0    |= (qh << 18)   & 0x00100000;  // 2 -> 20
 								        qs0    |= (qh << 25)   & 0x10000000;  // 3 -> 28
 								        qs0     = __vsubss4(qs0, 0x10101010); // subtract 16
 								        x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
 								        int qs1 = (ql >>  4)   & 0x0F0F0F0F;
 								        qs1    |= (qh >> 12)   & 0x00000010;  // 16 ->  4
 								        qs1    |= (qh >>  5)   & 0x00001000;  // 17 -> 12
 								        qs1    |= (qh <<  2)   & 0x00100000;  // 18 -> 20
 								        qs1    |= (qh <<  9)   & 0x10000000;  // 19 -> 28
 								        qs1     = __vsubss4(qs1, 0x10101010); // subtract 16
 								        x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int blocks_per_tile_x_row = WARP_SIZE / QI5_0;
 								    const int kbxd = k % blocks_per_tile_x_row;
 								    float * x_dmf = (float *) x_dm;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
+								#pragma unroll
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
 								        int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 14:48:10 +00:00
 								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 11:18:51 +00:00
+								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
 								    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
 								    const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 16:20:44 +00:00
+								    const float * x_dmf = (const float *) x_dm;
 								    const float * y_df  = (const float *) y_ds;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int u[2*VDR_Q5_0_Q8_1_MMQ];
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#pragma unroll
 								    for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
 								        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
 								        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
 								    }
 								    return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
 								        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
-												SOTA 2-bit quants (#4773)

* iq2_xxs: basics

* iq2_xxs: scalar and AVX2 dot products

Needed to change Q8_K to have quants in the -127...127 range,
else the IQ2_XXS AVX implementation becomes very awkward.
The alternative would have been to use Q8_0 instead. Perhaps
I'll change later, for now this is what we have.

* iq2_xxs: ARM_NEON dot product

Somehow strangely slow (112 ms/token).

* iq2_xxs: WIP Metal

Dequantize works, something is still wrong with the
dot product.

* iq2_xxs: Metal dot product now works

We have
PP-512 = 475 t/s
TG-128 = 47.3 t/s

Not the greatest performance, but not complete garbage either.

* iq2_xxs: slighty faster dot product

TG-128 is now 48.4 t/s

* iq2_xxs: slighty faster dot product

TG-128 is now 50.9 t/s

* iq2_xxs: even faster Metal dot product

TG-128 is now 54.1 t/s.

Strangely enough, putting the signs lookup table
into shared memory has a bigger impact than the
grid values being in shared memory.

* iq2_xxs: dequantize CUDA kernel - fix conflict with master

* iq2_xxs: quantized CUDA dot product (MMVQ)

We get TG-128 = 153.1 t/s

* iq2_xxs: slightly faster CUDA dot product

TG-128 is now at 155.1 t/s.

* iq2_xxs: add to llama ftype enum

* iq2_xxs: fix MoE on Metal

* Fix missing MMQ ops when on hipBLAS

I had put the ggml_supports_mmq call at the wrong place.

* Fix bug in qequantize_row_iq2_xxs

The 0.25f factor was missing.
Great detective work by @ggerganov!

* Fixing tests

* PR suggestion

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-08 15:02:32 +00:00
+								    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
 								    int vl[VDR_Q5_1_Q8_1_MMVQ];
 								    int vh[VDR_Q5_1_Q8_1_MMVQ];
 								    int  u[2*VDR_Q5_1_Q8_1_MMVQ];
 								#pragma unroll
 								    for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
 								        vl[i]   = get_int_from_uint8_aligned(bq5_1->qs, iqs + i);
 								        vh[i]   = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i));
 								        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
 								        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1);
-												SOTA 2-bit quants (#4773)

* iq2_xxs: basics

* iq2_xxs: scalar and AVX2 dot products

Needed to change Q8_K to have quants in the -127...127 range,
else the IQ2_XXS AVX implementation becomes very awkward.
The alternative would have been to use Q8_0 instead. Perhaps
I'll change later, for now this is what we have.

* iq2_xxs: ARM_NEON dot product

Somehow strangely slow (112 ms/token).

* iq2_xxs: WIP Metal

Dequantize works, something is still wrong with the
dot product.

* iq2_xxs: Metal dot product now works

We have
PP-512 = 475 t/s
TG-128 = 47.3 t/s

Not the greatest performance, but not complete garbage either.

* iq2_xxs: slighty faster dot product

TG-128 is now 48.4 t/s

* iq2_xxs: slighty faster dot product

TG-128 is now 50.9 t/s

* iq2_xxs: even faster Metal dot product

TG-128 is now 54.1 t/s.

Strangely enough, putting the signs lookup table
into shared memory has a bigger impact than the
grid values being in shared memory.

* iq2_xxs: dequantize CUDA kernel - fix conflict with master

* iq2_xxs: quantized CUDA dot product (MMVQ)

We get TG-128 = 153.1 t/s

* iq2_xxs: slightly faster CUDA dot product

TG-128 is now at 155.1 t/s.

* iq2_xxs: add to llama ftype enum

* iq2_xxs: fix MoE on Metal

* Fix missing MMQ ops when on hipBLAS

I had put the ggml_supports_mmq call at the wrong place.

* Fix bug in qequantize_row_iq2_xxs

The 0.25f factor was missing.
Great detective work by @ggerganov!

* Fixing tests

* PR suggestion

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-08 15:02:32 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
 								    return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
-												SOTA 2-bit quants (#4773)

* iq2_xxs: basics

* iq2_xxs: scalar and AVX2 dot products

Needed to change Q8_K to have quants in the -127...127 range,
else the IQ2_XXS AVX implementation becomes very awkward.
The alternative would have been to use Q8_0 instead. Perhaps
I'll change later, for now this is what we have.

* iq2_xxs: ARM_NEON dot product

Somehow strangely slow (112 ms/token).

* iq2_xxs: WIP Metal

Dequantize works, something is still wrong with the
dot product.

* iq2_xxs: Metal dot product now works

We have
PP-512 = 475 t/s
TG-128 = 47.3 t/s

Not the greatest performance, but not complete garbage either.

* iq2_xxs: slighty faster dot product

TG-128 is now 48.4 t/s

* iq2_xxs: slighty faster dot product

TG-128 is now 50.9 t/s

* iq2_xxs: even faster Metal dot product

TG-128 is now 54.1 t/s.

Strangely enough, putting the signs lookup table
into shared memory has a bigger impact than the
grid values being in shared memory.

* iq2_xxs: dequantize CUDA kernel - fix conflict with master

* iq2_xxs: quantized CUDA dot product (MMVQ)

We get TG-128 = 153.1 t/s

* iq2_xxs: slightly faster CUDA dot product

TG-128 is now at 155.1 t/s.

* iq2_xxs: add to llama ftype enum

* iq2_xxs: fix MoE on Metal

* Fix missing MMQ ops when on hipBLAS

I had put the ggml_supports_mmq call at the wrong place.

* Fix bug in qequantize_row_iq2_xxs

The 0.25f factor was missing.
Great detective work by @ggerganov!

* Fixing tests

* PR suggestion

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-08 15:02:32 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
 								    GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
-												ggml : SOTA 2-bit quants (add IQ2_XS) (#4856)

* iq2_xs: basics

* iq2_xs: this should have been in the basics

* iq2_xs: CUDA and scalar CPU works

* iq2_xs: WIP Metal

* iq2_xs: Metal now works

* iq2_xs: working, but dog slow, ARM_NEON dot product

* iq2_xs: better ARM_NEON dot product

We are now at 19.5 t/s for TG-128 and 61 t/s for PP-512 when
running on the CPU.

* iq2_xs: AVX2 dot product - 19.5 t/s

* iq2_xs: faster AVX2 dit product

21.4 t/s for TG-128, 59.2 t/s for PP-512.
The latter is 2x compared to the previous version.

* iq2_xs: had forgotten to delete iq2-data.h

* Add llama enum for IQ2_XS

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-11 19:39:39 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    __shared__ int   tile_x_ql[mmq_y * (2*WARP_SIZE)     + mmq_y];
 								    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
 								    *x_ql = tile_x_ql;
 								    *x_dm = tile_x_dm;
-												SOTA 3-bit quants  (#5196)

* iq3_xxs: quantize/dequantize

RMSE seems a bit high-ish at about half-way between q2_K and
q3_K, so need to check more.

* iq3_xxs: CUDA dequantize works

* iq2_xxs: tuning quantization

* iq3_xxs: starting to look better

PPL on wiki.test.raw
LLaMA-v1-7B: 6.4218
LLaMA-v2-7B: 6.3560
Mistral-7B : 6.0717

This is better than Q3_K_XS, with a 5% reduction in quantized model
size.

* iq3_xxs: CUDA dot product

We have
PP-512: 5891 t/s
TG-128: 143.9 t/s

* iq3_xxs: scalar and AVX2 dot products

* iq3_xxs: ARM_NEON and Metal

Metal performance is decent, ARM_NEON is pathetic

* iq3_xxs: slightly better grid points

* Faster iq3_xxs and iq2_xs dot products on CUDA

* iq3_xxs: add some quant mix

* iq3_xxs: fix failing quantization test

Dot product still fails. Is this real?

* iq3_xxs: hopefully fix ROCm

* iq3_xxs: failing tests

This time the dot product accuracy did find an actual bug
in the AVX2 implementation.

* Add IQ3_XXS to test-backend-ops

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-30 13:14:12 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
 								    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
 								    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 								    GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
-												Adding IQ2_S and IQ2_M to complete coverage of the 2-3 bit quantization range (#5721)

* Adding IQ2_S and IQ2_M as a single cumulative commit

* Update examples/quantize/quantize.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-02-26 16:28:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_CUDA_ASSUME(i_offset >= 0);
 								    GGML_CUDA_ASSUME(i_offset < nwarps);
 								    GGML_CUDA_ASSUME(k >= 0);
 								    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-												Adding IQ2_S and IQ2_M to complete coverage of the 2-3 bit quantization range (#5721)

* Adding IQ2_S and IQ2_M as a single cumulative commit

* Update examples/quantize/quantize.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-02-26 16:28:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int kbx  = k / QI5_1;
 								    const int kqsx = k % QI5_1;
-												SOTA 3-bit quants  (#5196)

* iq3_xxs: quantize/dequantize

RMSE seems a bit high-ish at about half-way between q2_K and
q3_K, so need to check more.

* iq3_xxs: CUDA dequantize works

* iq2_xxs: tuning quantization

* iq3_xxs: starting to look better

PPL on wiki.test.raw
LLaMA-v1-7B: 6.4218
LLaMA-v2-7B: 6.3560
Mistral-7B : 6.0717

This is better than Q3_K_XS, with a 5% reduction in quantized model
size.

* iq3_xxs: CUDA dot product

We have
PP-512: 5891 t/s
TG-128: 143.9 t/s

* iq3_xxs: scalar and AVX2 dot products

* iq3_xxs: ARM_NEON and Metal

Metal performance is decent, ARM_NEON is pathetic

* iq3_xxs: slightly better grid points

* Faster iq3_xxs and iq2_xs dot products on CUDA

* iq3_xxs: add some quant mix

* iq3_xxs: fix failing quantization test

Dot product still fails. Is this real?

* iq3_xxs: hopefully fix ROCm

* iq3_xxs: failing tests

This time the dot product accuracy did find an actual bug
in the AVX2 implementation.

* Add IQ3_XXS to test-backend-ops

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-30 13:14:12 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const block_q5_1 * bx0 = (const block_q5_1 *) vx;
-												ggml : SOTA 2-bit quants (add IQ2_XS) (#4856)

* iq2_xs: basics

* iq2_xs: this should have been in the basics

* iq2_xs: CUDA and scalar CPU works

* iq2_xs: WIP Metal

* iq2_xs: Metal now works

* iq2_xs: working, but dog slow, ARM_NEON dot product

* iq2_xs: better ARM_NEON dot product

We are now at 19.5 t/s for TG-128 and 61 t/s for PP-512 when
running on the CPU.

* iq2_xs: AVX2 dot product - 19.5 t/s

* iq2_xs: faster AVX2 dit product

21.4 t/s for TG-128, 59.2 t/s for PP-512.
The latter is 2x compared to the previous version.

* iq2_xs: had forgotten to delete iq2-data.h

* Add llama enum for IQ2_XS

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-11 19:39:39 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#pragma unroll
 								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
 								        int i = i0 + i_offset;
-												IQ3_S: a much better alternative to Q3_K (#5676)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* Resurrecting iq3_xs

After all the experimentation, nothing was better than this.

* Minor PPL improvement via a block scale fudge factor

* Minor improvement via 3 neighbours

* iq3_xs: working scalar and AVX2 dot products

* iq3_xs: ARM_NEON dot product - works but extremely slow (10 t/s)

* iq3_xs: working Metal implementation

* Adding IQ3_M - IQ3_XS mix with mostly Q4_K

* iiq3_xs: a 3.4375 bpw variant

* iq3_xs: make CUDA work for new version

* iq3_xs: make scalar and AVX2 work for new version

* iq3_s: make ARM_NEON work with new version

* iq3_xs: make new version work on metal

Performance is very similar to Q3_K_S

* iq3_xs: tiny Metal speed improvement

* iq3_xs: tiny Metal speed improvement

* Fix stupid warning

* Q3_K_XS now uses a mix of IQ3_XS and IQ3_XXS

* iq3_xs: rename to iq3_s

* iq3_s: make tests pass

* Move Q3_K_XS mix to 3.25 bpw

* Attempt to fix failing tests

* Another attempt to fix the Windows builds

* Attempt to fix ROCm

* ROCm again

* iq3_s: partial fix for QK_K = 64

* iq3_s: make it work on metal for QK_K = 64

Pleasent surprise: the coding was super-block size independent,
so all it took was to delete some QK_K == 256 guards.

* Will this fix ROCm?

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-24 14:23:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												IQ3_S: a much better alternative to Q3_K (#5676)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* Resurrecting iq3_xs

After all the experimentation, nothing was better than this.

* Minor PPL improvement via a block scale fudge factor

* Minor improvement via 3 neighbours

* iq3_xs: working scalar and AVX2 dot products

* iq3_xs: ARM_NEON dot product - works but extremely slow (10 t/s)

* iq3_xs: working Metal implementation

* Adding IQ3_M - IQ3_XS mix with mostly Q4_K

* iiq3_xs: a 3.4375 bpw variant

* iq3_xs: make CUDA work for new version

* iq3_xs: make scalar and AVX2 work for new version

* iq3_s: make ARM_NEON work with new version

* iq3_xs: make new version work on metal

Performance is very similar to Q3_K_S

* iq3_xs: tiny Metal speed improvement

* iq3_xs: tiny Metal speed improvement

* Fix stupid warning

* Q3_K_XS now uses a mix of IQ3_XS and IQ3_XXS

* iq3_xs: rename to iq3_s

* iq3_s: make tests pass

* Move Q3_K_XS mix to 3.25 bpw

* Attempt to fix failing tests

* Another attempt to fix the Windows builds

* Attempt to fix ROCm

* ROCm again

* iq3_s: partial fix for QK_K = 64

* iq3_s: make it work on metal for QK_K = 64

Pleasent surprise: the coding was super-block size independent,
so all it took was to delete some QK_K == 256 guards.

* Will this fix ROCm?

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-24 14:23:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx;
-.5 bit quantization (#5453)

* iq1_s: WIP basics

* iq1_s: CUDA is working

* iq1_s: scalar CPU dot product

* iq1_s: WIP AVX2 dot product - something is not right

* Fix tests

* Fix shadow warnings

* Fix after merge with latest master

* iq1_s: AVX2 finally works

* iq1_s: ARM_NEON dot product. Works, but not very fast

* iq1_s: better grid

* iq1_s: use IQ2_XXS for attn_output

At a cost of 0.04 extra bpw this gives a big improvement in PPL.

* iq1_s: Metal basics

Dequantize works, but not dot product

* iq1_s: Metal works, but quite slow

As usual, Apple Silicon does not like the code I write.

* iq1_s: Tests

* iq1_s: slightly faster dot product

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-18 16:16:55 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
 								        const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1));
 								        int qs0 = (ql >>  0) & 0x0F0F0F0F;
 								        qs0    |= (qh <<  4) & 0x00000010; // 0 ->  4
 								        qs0    |= (qh << 11) & 0x00001000; // 1 -> 12
 								        qs0    |= (qh << 18) & 0x00100000; // 2 -> 20
 								        qs0    |= (qh << 25) & 0x10000000; // 3 -> 28
 								        x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
 								        int qs1 = (ql >>  4) & 0x0F0F0F0F;
 								        qs1    |= (qh >> 12) & 0x00000010; // 16 ->  4
 								        qs1    |= (qh >>  5) & 0x00001000; // 17 -> 12
 								        qs1    |= (qh <<  2) & 0x00100000; // 18 -> 20
 								        qs1    |= (qh <<  9) & 0x10000000; // 19 -> 28
 								        x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
-.5 bit quantization (#5453)

* iq1_s: WIP basics

* iq1_s: CUDA is working

* iq1_s: scalar CPU dot product

* iq1_s: WIP AVX2 dot product - something is not right

* Fix tests

* Fix shadow warnings

* Fix after merge with latest master

* iq1_s: AVX2 finally works

* iq1_s: ARM_NEON dot product. Works, but not very fast

* iq1_s: better grid

* iq1_s: use IQ2_XXS for attn_output

At a cost of 0.04 extra bpw this gives a big improvement in PPL.

* iq1_s: Metal basics

Dequantize works, but not dot product

* iq1_s: Metal works, but quite slow

As usual, Apple Silicon does not like the code I write.

* iq1_s: Tests

* iq1_s: slightly faster dot product

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-18 16:16:55 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
 								    const int blocks_per_tile_x_row = WARP_SIZE / QI5_1;
 								    const int kbxd = k % blocks_per_tile_x_row;
 								#pragma unroll
 								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
 								        int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
 								        if (need_check) {
 								            i = min(i, i_max);
-												Better 1.5 bit quantization  (#5971)

* Trying blocvks of 16 for IQ1_S - seems slightly better

* iq1s_blocks16: Adjust scale fudge factor to 1.125

* iq1s_blocks16: going to blocks of 32

with 2048 lattice points, so same bpw.
This is even better than blocks of 16.
Should I try blocks of 64? But to keep the same
bpw, when I go to 4096 lattice points, I need to
remove blocks alltogether and just have superblocks of
256 weights.

* iq1s_blocks16: Use 2*<x^2> as sigma2 in weight adjustment

* iq1s_blocks16: scalar and AVX2 dot products

* iq1s_blocks16: CUDA dot product

* iq1s_blocks16: Metal works, Neon does not

Metal works but TG is dog slow (35 t/s). PP is OKish (493 t/s).
Not seeing the bug in the Neon implementation for now.

* iq1s_blocks16: fixed Neon

* iq1s_blocks16: very slightly faster TG on Metal

Still pathetic at 37 t/s

* iq1s_blocks16: speedup Metal by packing codebook into uint32_t's

* Formatting

* iq1s_blocks16: uint32_t codebook is also better in CUDA

TG-128 is now 204 t/s up from 194 t/s.
PP-512 is 5890 t/s, so significantly better than other quants

* iq1s_blocks16: slightly faster Neon dot product

* iq1s_blocks16: faster AVX2 dot product

* iq1s_blocks16: adjust to ggml-common.h

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-03-11 06:51:49 +00:00
+								        }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
 								        const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd;
 								        x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm;
-.5 bit quantization (#5453)

* iq1_s: WIP basics

* iq1_s: CUDA is working

* iq1_s: scalar CPU dot product

* iq1_s: WIP AVX2 dot product - something is not right

* Fix tests

* Fix shadow warnings

* Fix after merge with latest master

* iq1_s: AVX2 finally works

* iq1_s: ARM_NEON dot product. Works, but not very fast

* iq1_s: better grid

* iq1_s: use IQ2_XXS for attn_output

At a cost of 0.04 extra bpw this gives a big improvement in PPL.

* iq1_s: Metal basics

Dequantize works, but not dot product

* iq1_s: Metal works, but quite slow

As usual, Apple Silicon does not like the code I write.

* iq1_s: Tests

* iq1_s: slightly faster dot product

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-18 16:16:55 +00:00
+								    }
 								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
 								    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
 								    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
 								    GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
-												IQ4_NL: 4-bit non-linear quants with blocks of 32 (#5590)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* iq4_nl: Fix after merging with master

* iq4_nl: another fix after merging with master

* Use IQ4_NL instead of Q4_K when using k-quants is not possible

* Fix typo that makes several tests fail

* It was the ggml_vdotq thing missed inside the brackets

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-21 09:39:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
 								    const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
 								    int u[2*VDR_Q5_1_Q8_1_MMQ];
 								#pragma unroll
 								    for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
 								        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
 								        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
 								    }
 								    return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
 								        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
-												IQ4_NL: 4-bit non-linear quants with blocks of 32 (#5590)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* iq4_nl: Fix after merging with master

* iq4_nl: another fix after merging with master

* Use IQ4_NL instead of Q4_K when using k-quants is not possible

* Fix typo that makes several tests fail

* It was the ggml_vdotq thing missed inside the brackets

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-21 09:39:52 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
-												IQ4_NL: 4-bit non-linear quants with blocks of 32 (#5590)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* iq4_nl: Fix after merging with master

* iq4_nl: another fix after merging with master

* Use IQ4_NL instead of Q4_K when using k-quants is not possible

* Fix typo that makes several tests fail

* It was the ggml_vdotq thing missed inside the brackets

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-21 09:39:52 +00:00
+								    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
-												IQ4_NL: 4-bit non-linear quants with blocks of 32 (#5590)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* iq4_nl: Fix after merging with master

* iq4_nl: another fix after merging with master

* Use IQ4_NL instead of Q4_K when using k-quants is not possible

* Fix typo that makes several tests fail

* It was the ggml_vdotq thing missed inside the brackets

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-21 09:39:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int v[VDR_Q8_0_Q8_1_MMVQ];
 								    int u[VDR_Q8_0_Q8_1_MMVQ];
-												IQ4_NL: 4-bit non-linear quants with blocks of 32 (#5590)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* iq4_nl: Fix after merging with master

* iq4_nl: another fix after merging with master

* Use IQ4_NL instead of Q4_K when using k-quants is not possible

* Fix typo that makes several tests fail

* It was the ggml_vdotq thing missed inside the brackets

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-21 09:39:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#pragma unroll
 								    for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
 								        v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
 								        u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-												IQ4_NL: 4-bit non-linear quants with blocks of 32 (#5590)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* iq4_nl: Fix after merging with master

* iq4_nl: another fix after merging with master

* Use IQ4_NL instead of Q4_K when using k-quants is not possible

* Fix typo that makes several tests fail

* It was the ggml_vdotq thing missed inside the brackets

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-21 09:39:52 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
 								}
-												IQ4_NL: 4-bit non-linear quants with blocks of 32 (#5590)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* iq4_nl: Fix after merging with master

* iq4_nl: another fix after merging with master

* Use IQ4_NL instead of Q4_K when using k-quants is not possible

* Fix typo that makes several tests fail

* It was the ggml_vdotq thing missed inside the brackets

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-21 09:39:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
 								    GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
 								    __shared__ int  tile_x_qs[mmq_y * (WARP_SIZE)       + mmq_y];
 								    __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
 								    *x_ql = tile_x_qs;
 								    *x_dm = (half2 *) tile_x_d;
-												IQ4_NL: 4-bit non-linear quants with blocks of 32 (#5590)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* iq4_nl: Fix after merging with master

* iq4_nl: another fix after merging with master

* Use IQ4_NL instead of Q4_K when using k-quants is not possible

* Fix typo that makes several tests fail

* It was the ggml_vdotq thing missed inside the brackets

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-21 09:39:52 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
 								    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
 								    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 								    GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
-												IQ4_XS: a 4.25 bpw quantization (#5747)

* Try IQ4_NL with blocks of 64 - does not look good

* iq4_xs: go to super-blocks of 256 and 6-bit scales for blocks of 32

* iq4_xs: CUDA works - 133.2 t/s

* iq4_xs: AVX2 dot product

* iq4_xs: ARM_NEON dot product

* iq4_nl: Metal implementation

As usual, Metal / Apple Silicon don't like my quants.

* iq3_xs: minor fix

* iq4_xs: shrink by using IQ3_S for attn_k and attn_q

* iq4_xs: revert using IQ3_S for attn_k and attn_v

PPL vs size is good, but CPU performance suffers: on M2 Max
TG-128 drops to 21.7 t/s from 28.8, and on a Ryzen-7950X
to 14.5 t/s from 15.8 t/s. On CUDA we have 135 t/s when
using IQ3_S vs 133 t/s with pure IQ4_XS.

* Fix CI

* iq4_xs: Added forgotten check for 256 divisibility

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-27 14:34:24 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_CUDA_ASSUME(i_offset >= 0);
 								    GGML_CUDA_ASSUME(i_offset <  nwarps);
 								    GGML_CUDA_ASSUME(k >= 0);
 								    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-												IQ4_XS: a 4.25 bpw quantization (#5747)

* Try IQ4_NL with blocks of 64 - does not look good

* iq4_xs: go to super-blocks of 256 and 6-bit scales for blocks of 32

* iq4_xs: CUDA works - 133.2 t/s

* iq4_xs: AVX2 dot product

* iq4_xs: ARM_NEON dot product

* iq4_nl: Metal implementation

As usual, Metal / Apple Silicon don't like my quants.

* iq3_xs: minor fix

* iq4_xs: shrink by using IQ3_S for attn_k and attn_q

* iq4_xs: revert using IQ3_S for attn_k and attn_v

PPL vs size is good, but CPU performance suffers: on M2 Max
TG-128 drops to 21.7 t/s from 28.8, and on a Ryzen-7950X
to 14.5 t/s from 15.8 t/s. On CUDA we have 135 t/s when
using IQ3_S vs 133 t/s with pure IQ4_XS.

* Fix CI

* iq4_xs: Added forgotten check for 256 divisibility

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-27 14:34:24 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int kbx  = k / QI8_0;
 								    const int kqsx = k % QI8_0;
 								    float * x_dmf = (float *) x_dm;
-												IQ4_XS: a 4.25 bpw quantization (#5747)

* Try IQ4_NL with blocks of 64 - does not look good

* iq4_xs: go to super-blocks of 256 and 6-bit scales for blocks of 32

* iq4_xs: CUDA works - 133.2 t/s

* iq4_xs: AVX2 dot product

* iq4_xs: ARM_NEON dot product

* iq4_nl: Metal implementation

As usual, Metal / Apple Silicon don't like my quants.

* iq3_xs: minor fix

* iq4_xs: shrink by using IQ3_S for attn_k and attn_q

* iq4_xs: revert using IQ3_S for attn_k and attn_v

PPL vs size is good, but CPU performance suffers: on M2 Max
TG-128 drops to 21.7 t/s from 28.8, and on a Ryzen-7950X
to 14.5 t/s from 15.8 t/s. On CUDA we have 135 t/s when
using IQ3_S vs 133 t/s with pure IQ4_XS.

* Fix CI

* iq4_xs: Added forgotten check for 256 divisibility

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-27 14:34:24 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const block_q8_0 * bx0 = (const block_q8_0 *) vx;
-												IQ4_XS: a 4.25 bpw quantization (#5747)

* Try IQ4_NL with blocks of 64 - does not look good

* iq4_xs: go to super-blocks of 256 and 6-bit scales for blocks of 32

* iq4_xs: CUDA works - 133.2 t/s

* iq4_xs: AVX2 dot product

* iq4_xs: ARM_NEON dot product

* iq4_nl: Metal implementation

As usual, Metal / Apple Silicon don't like my quants.

* iq3_xs: minor fix

* iq4_xs: shrink by using IQ3_S for attn_k and attn_q

* iq4_xs: revert using IQ3_S for attn_k and attn_v

PPL vs size is good, but CPU performance suffers: on M2 Max
TG-128 drops to 21.7 t/s from 28.8, and on a Ryzen-7950X
to 14.5 t/s from 15.8 t/s. On CUDA we have 135 t/s when
using IQ3_S vs 133 t/s with pure IQ4_XS.

* Fix CI

* iq4_xs: Added forgotten check for 256 divisibility

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-27 14:34:24 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#pragma unroll
 								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
 								        int i = i0 + i_offset;
 								        if (need_check) {
 								            i = min(i, i_max);
 								        }
 								        const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
 								        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
-												IQ4_XS: a 4.25 bpw quantization (#5747)

* Try IQ4_NL with blocks of 64 - does not look good

* iq4_xs: go to super-blocks of 256 and 6-bit scales for blocks of 32

* iq4_xs: CUDA works - 133.2 t/s

* iq4_xs: AVX2 dot product

* iq4_xs: ARM_NEON dot product

* iq4_nl: Metal implementation

As usual, Metal / Apple Silicon don't like my quants.

* iq3_xs: minor fix

* iq4_xs: shrink by using IQ3_S for attn_k and attn_q

* iq4_xs: revert using IQ3_S for attn_k and attn_v

PPL vs size is good, but CPU performance suffers: on M2 Max
TG-128 drops to 21.7 t/s from 28.8, and on a Ryzen-7950X
to 14.5 t/s from 15.8 t/s. On CUDA we have 135 t/s when
using IQ3_S vs 133 t/s with pure IQ4_XS.

* Fix CI

* iq4_xs: Added forgotten check for 256 divisibility

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-27 14:34:24 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
 								    const int kbxd = k % blocks_per_tile_x_row;
-												IQ4_XS: a 4.25 bpw quantization (#5747)

* Try IQ4_NL with blocks of 64 - does not look good

* iq4_xs: go to super-blocks of 256 and 6-bit scales for blocks of 32

* iq4_xs: CUDA works - 133.2 t/s

* iq4_xs: AVX2 dot product

* iq4_xs: ARM_NEON dot product

* iq4_nl: Metal implementation

As usual, Metal / Apple Silicon don't like my quants.

* iq3_xs: minor fix

* iq4_xs: shrink by using IQ3_S for attn_k and attn_q

* iq4_xs: revert using IQ3_S for attn_k and attn_v

PPL vs size is good, but CPU performance suffers: on M2 Max
TG-128 drops to 21.7 t/s from 28.8, and on a Ryzen-7950X
to 14.5 t/s from 15.8 t/s. On CUDA we have 135 t/s when
using IQ3_S vs 133 t/s with pure IQ4_XS.

* Fix CI

* iq4_xs: Added forgotten check for 256 divisibility

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-27 14:34:24 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#pragma unroll
 								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
 								        int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d;
 								    }
 								}
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
 								    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
 								    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
 								    GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const float * x_dmf = (const float *) x_dm;
 								    const float * y_df  = (const float *) y_ds;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
 								        (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
 								         y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
 								}
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
 								    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const block_q2_K * bq2_K = (const block_q2_K *) vbq;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int bq8_offset = QR2_K * (iqs / QI8_1);
 								    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const uint8_t * scales = bq2_K->scales + scale_offset;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
 								    int    u[QR2_K];
 								    float d8[QR2_K];
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 07:42:34 +00:00
+								#pragma unroll
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int i = 0; i < QR2_K; ++ i) {
 								        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
 								        d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
 								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
 								}
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
 								    GGML_UNUSED(x_qh);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    __shared__ int   tile_x_ql[mmq_y * (WARP_SIZE)       + mmq_y];
 								    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
 								    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/4)     + mmq_y/4];
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    *x_ql = tile_x_ql;
 								    *x_dm = tile_x_dm;
 								    *x_sc = tile_x_sc;
 								}
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 07:42:34 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
 								    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
 								    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 								    GGML_UNUSED(x_qh);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_CUDA_ASSUME(i_offset >= 0);
 								    GGML_CUDA_ASSUME(i_offset <  nwarps);
 								    GGML_CUDA_ASSUME(k >= 0);
 								    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int kbx  = k / QI2_K;
 								    const int kqsx = k % QI2_K;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const block_q2_K * bx0 = (const block_q2_K *) vx;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
+								#pragma unroll
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
 								        int i = i0 + i_offset;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        if (need_check) {
 								            i = min(i, i_max);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								        }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx;
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								    }
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int blocks_per_tile_x_row = WARP_SIZE / QI2_K;
 								    const int kbxd = k % blocks_per_tile_x_row;
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#pragma unroll
 								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
 								        int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y;
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 09:20:24 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd;
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 09:20:24 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm;
 								    }
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#pragma unroll
 								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
 								        int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4);
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4));
 								    }
 								}
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
 								    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
 								    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
 								    GGML_UNUSED(x_qh);
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int kbx = k / QI2_K;
 								    const int ky  = (k % QI2_K) * QR2_K;
 								    const float * y_df = (const float *) y_ds;
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 09:20:24 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 09:20:24 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
 								    const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#pragma unroll
 								    for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) {
 								        v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
 								    }
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE;
 								    return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
 								    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const block_q3_K * bq3_K = (const block_q3_K *) vbq;
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
 								    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 09:20:24 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const float d = bq3_K->d;
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 09:20:24 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int vl = get_int_from_uint8(bq3_K->qs, iqs);
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
 								    const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int    u[QR3_K];
 								    float d8[QR3_K];
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#pragma unroll
 								    for (int i = 0; i < QR3_K; ++i) {
 								        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
 								        d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
 								    }
 								    return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    __shared__ int   tile_x_ql[mmq_y * (WARP_SIZE)       + mmq_y];
 								    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K];
 								    __shared__ int   tile_x_qh[mmq_y * (WARP_SIZE/2)     + mmq_y/2];
 								    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/4)     + mmq_y/4];
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    *x_ql = tile_x_ql;
 								    *x_dm = tile_x_dm;
 								    *x_qh = tile_x_qh;
 								    *x_sc = tile_x_sc;
 								}
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 09:20:24 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
 								    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
 								    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 09:20:24 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_CUDA_ASSUME(i_offset >= 0);
 								    GGML_CUDA_ASSUME(i_offset <  nwarps);
 								    GGML_CUDA_ASSUME(k >= 0);
 								    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int kbx  = k / QI3_K;
 								    const int kqsx = k % QI3_K;
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const block_q3_K * bx0 = (const block_q3_K *) vx;
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#pragma unroll
 								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
 								        int i = i0 + i_offset;
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx;
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
 								    }
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 09:20:24 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
 								    const int kbxd = k % blocks_per_tile_x_row;
 								    float * x_dmf = (float *) x_dm;
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 09:20:24 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#pragma unroll
 								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
 								        int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y;
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d;
 								    }
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#pragma unroll
 								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
 								        int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 09:20:24 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
 								        x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
 								    }
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 09:20:24 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#pragma unroll
 								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
 								        int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int ksc = k % (QI3_K/4);
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int ksc_low = ksc % (QI3_K/8);
 								        const int shift_low = 4 * (ksc / (QI3_K/8));
 								        const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int ksc_high = QI3_K/8;
 								        const int shift_high = 2 * ksc;
 								        const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030;
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int sc = __vsubss4(sc_low | sc_high, 0x20202020);
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 09:20:24 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc;
 								    }
 								}
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 09:20:24 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
 								    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
 								    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int kbx  = k / QI3_K;
 								    const int ky  = (k % QI3_K) * QR3_K;
 								    const float * x_dmf = (const float *) x_dm;
 								    const float * y_df  = (const float *) y_ds;
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
 								#pragma unroll
 								    for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
 								        const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
 								        const int shift = 2 * ((ky % 32) / 8);
 								        const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;
 								        const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
 								        const int vlh = (vh << 2) & 0x04040404;
 								        v[l] = __vsubss4(vll, vlh);
 								    }
 								    const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE;
 								    return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
 								    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#ifndef GGML_QKK_64
 								    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int    v[2];
 								    int    u[2*QR4_K];
 								    float d8[QR4_K];
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 09:20:24 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6
 								    const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2));
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 09:20:24 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
 								    // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
 								    // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
 								    // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
 								    v[0] = q4[0];
 								    v[1] = q4[4];
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const uint16_t * scales = (const uint16_t *)bq4_K->scales;
 								    uint16_t aux[2];
 								    const int j = bq8_offset/2;
 								    if (j < 2) {
 								        aux[0] = scales[j+0] & 0x3f3f;
 								        aux[1] = scales[j+2] & 0x3f3f;
 								    } else {
 								        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
 								        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
 								    }
 								    const uint8_t * sc = (const uint8_t *)aux;
 								    const uint8_t * m  = sc + 2;
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int i = 0; i < QR4_K; ++i) {
 								        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
 								        d8[i] = __low2float(bq8i->ds);
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
 								        u[2*i+0] = q8[0];
 								        u[2*i+1] = q8[4];
 								    }
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 09:20:24 +00:00
+								#else
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 09:20:24 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    float sumf_d = 0.0f;
 								    float sumf_m = 0.0f;
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    uint16_t aux16[2];
 								    const uint8_t * s = (const uint8_t *)aux16;
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const uint16_t * a = (const uint16_t *)bq4_K->scales;
 								    aux16[0] = a[0] & 0x0f0f;
 								    aux16[1] = (a[0] >> 4) & 0x0f0f;
 								    const float dall = bq4_K->dm[0];
 								    const float dmin = bq4_K->dm[1];
 								    const float d8_1 = __low2float(bq8_1[0].ds);
 								    const float d8_2 = __low2float(bq8_1[1].ds);
 								    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
 								    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
 								    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
 								    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
 								    const int * q4 = (const int *)bq4_K->qs + (iqs/2);
 								    const int v1 = q4[0];
 								    const int v2 = q4[4];
 								    const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
 								    const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
 								    const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
 								    const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
 								    sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
 								    sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
 								    return dall * sumf_d - dmin * sumf_m;
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
 								#else
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 12:31:56 +00:00
+								    NO_DEVICE_CODE;
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : improve text-generation and batched decoding performance (#3776)

* cuda : prints wip

* cuda : new cublas gemm branch for multi-batch quantized src0

* cuda : add F32 sgemm branch

* cuda : fine-tune >= VOLTA params + use MMQ only for small batches

* cuda : remove duplicated cuBLAS GEMM code

* cuda : add CUDA_USE_TENSOR_CORES and GGML_CUDA_FORCE_MMQ macros

* build : add compile option to force use of MMQ kernels
											
										
										
											2023-10-27 14:01:23 +00:00
+								#endif
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								}
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
 								    GGML_UNUSED(x_qh);
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    __shared__ int   tile_x_ql[mmq_y * (WARP_SIZE)       + mmq_y];
 								    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
 								    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/8)     + mmq_y/8];
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    *x_ql = tile_x_ql;
 								    *x_dm = tile_x_dm;
 								    *x_sc = tile_x_sc;
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
 								    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
 								    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 								    GGML_UNUSED(x_qh);
-												CUDA: mul_mat_vec_q for batch sizes > 1 (#5351)


											
										
										
											2024-02-06 13:44:06 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_CUDA_ASSUME(i_offset >= 0);
 								    GGML_CUDA_ASSUME(i_offset <  nwarps);
 								    GGML_CUDA_ASSUME(k >= 0);
 								    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-												CUDA: mul_mat_vec_q for batch sizes > 1 (#5351)


											
										
										
											2024-02-06 13:44:06 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int kbx  = k / QI4_K; // == 0 if QK_K == 256
 								    const int kqsx = k % QI4_K; // == k if QK_K == 256
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const block_q4_K * bx0 = (const block_q4_K *) vx;
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#pragma unroll
 								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
 								        int i = i0 + i_offset;
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
 								    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
-												CUDA: more warps for mmvq on NVIDIA (#5394)


											
										
										
											2024-02-08 20:56:40 +00:00
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
+								#pragma unroll
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
 								        int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y;
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        if (need_check) {
 								            i = min(i, i_max);
-												CUDA: mul_mat_vec_q for batch sizes > 1 (#5351)


											
										
										
											2024-02-06 13:44:06 +00:00
+								        }
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if QK_K == 256
 								        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
 								#else
 								        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
 								#endif
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#pragma unroll
 								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
 								        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 13:38:36 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 13:38:36 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 13:38:36 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int * scales = (const int *) bxi->scales;
-												cuda : performance optimizations (#1530)

* xor hack

* block y dim

* loop unrolling

* Fixed cmake LLAMA_CUDA_BY option

* Removed hipblas compatibility code

* Define GGML_CUDA_DMMV_BLOCK_Y if not defined

* Fewer iters, more ops per iter

* Renamed DMMV X/Y compilation options
											
										
										
											2023-05-25 21:07:29 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int ksc = k % (WARP_SIZE/8);
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 13:38:36 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
 								        int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
 								        scales8    |= (scales[ksc/2]              >> (2 * (ksc % 2)))       & 0x30303030; // upper 2 bits
-												cuda : performance optimizations (#1530)

* xor hack

* block y dim

* loop unrolling

* Fixed cmake LLAMA_CUDA_BY option

* Removed hipblas compatibility code

* Define GGML_CUDA_DMMV_BLOCK_Y if not defined

* Fewer iters, more ops per iter

* Renamed DMMV X/Y compilation options
											
										
										
											2023-05-25 21:07:29 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 13:38:36 +00:00
+								    }
 								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
 								    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
 								    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
 								    GGML_UNUSED(x_qh);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
 								    return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
 								                                      x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
 								}
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
 								    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#ifndef GGML_QKK_64
 								    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int   vl[2];
 								    int   vh[2];
 								    int    u[2*QR5_K];
 								    float d8[QR5_K];
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2));
 								    const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
 								    const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4));
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    vl[0] = ql[0];
 								    vl[1] = ql[4];
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    vh[0] = qh[0] >> bq8_offset;
 								    vh[1] = qh[4] >> bq8_offset;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const uint16_t * scales = (const uint16_t *)bq5_K->scales;
 								    uint16_t aux[2];
 								    const int j = bq8_offset/2;
 								    if (j < 2) {
 								        aux[0] = scales[j+0] & 0x3f3f;
 								        aux[1] = scales[j+2] & 0x3f3f;
 								    } else {
 								        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
 								        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const uint8_t * sc = (const uint8_t *)aux;
 								    const uint8_t * m  = sc + 2;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#pragma unroll
 								    for (int i = 0; i < QR5_K; ++i) {
 								        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
 								        d8[i] = __low2float(bq8i->ds);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
 								        u[2*i+0] = q8[0];
 								        u[2*i+1] = q8[4];
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#else
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int8_t * s = bq5_K->scales;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const float d = bq5_K->d;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const float d8_1 = __low2half(bq8_1[0].ds);
 								    const float d8_2 = __low2half(bq8_1[1].ds);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
 								    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
 								    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
 								    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int * ql = (const int *)bq5_K->qs + (iqs/2);
 								    const int vl1 = ql[0];
 								    const int vl2 = ql[4];
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int step = 4 * (iqs/2); // 0, 4, 8, 12
 								    const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
 								    const int in = step%8; // 0, 4, 0, 4
 								    const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
 								    const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
 								    const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
 								    const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
 								                       + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 13:48:37 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    return d * sumf_d;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#else
 								    NO_DEVICE_CODE;
 								#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#endif
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
 								    GGML_UNUSED(x_qh);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    __shared__ int   tile_x_ql[mmq_y * (2*WARP_SIZE)     + mmq_y];
 								    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
 								    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/8)     + mmq_y/8];
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    *x_ql = tile_x_ql;
 								    *x_dm = tile_x_dm;
 								    *x_sc = tile_x_sc;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
 								    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
 								    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 								    GGML_UNUSED(x_qh);
-												ggml : sync (im2col, GPU conv, 32-bit arm compat) (#4060)

ggml-ci
											
										
										
											2023-11-13 14:55:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_CUDA_ASSUME(i_offset >= 0);
 								    GGML_CUDA_ASSUME(i_offset <  nwarps);
 								    GGML_CUDA_ASSUME(k >= 0);
 								    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 12:37:33 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int kbx  = k / QI5_K; // == 0 if QK_K == 256
 								    const int kqsx = k % QI5_K; // == k if QK_K == 256
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 12:37:33 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const block_q5_K * bx0 = (const block_q5_K *) vx;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#pragma unroll
 								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
 								        int i = i0 + i_offset;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 12:23:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
 								        const int ky = QR5_K*kqsx;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
 								        const int ql0 = (ql >> 0) & 0x0F0F0F0F;
 								        const int ql1 = (ql >> 4) & 0x0F0F0F0F;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4));
 								        const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010;
 								        const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010;
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 11:03:17 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0;
 								        const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4);
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 11:03:17 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0;
 								        x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 11:03:17 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
 								    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 11:03:17 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#pragma unroll
 								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
 								        int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y;
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 11:03:17 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 11:03:17 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 11:03:17 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if QK_K == 256
 								        x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
 								#endif
 								    }
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 11:03:17 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#pragma unroll
 								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
 								        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 11:03:17 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        if (need_check) {
 								            i = min(i, i_max);
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 11:03:17 +00:00
+								        }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 11:03:17 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int * scales = (const int *) bxi->scales;
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 11:03:17 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int ksc = k % (WARP_SIZE/8);
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 11:03:17 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
 								        int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
 								        scales8    |= (scales[ksc/2]              >> (2 * (ksc % 2)))       & 0x30303030; // upper 2 bits
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 11:03:17 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 11:03:17 +00:00
+								    }
 								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
 								    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
 								    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
 								    GGML_UNUSED(x_qh);
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 11:03:17 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 11:03:17 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int index_x = i * (QR5_K*WARP_SIZE + 1) +  QR5_K*k;
 								    const int index_y = j * WARP_SIZE             + (QR5_K*k) % WARP_SIZE;
 								    return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
 								                                      x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
 								}
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 11:03:17 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
 								    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 11:03:17 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const block_q6_K * bq6_K = (const block_q6_K *) vbq;
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 11:03:17 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
 								    const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
 								    const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 11:03:17 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int vl = get_int_from_uint8(bq6_K->ql, iqs);
 								    const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 11:03:17 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int8_t * scales = bq6_K->scales + scale_offset;
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 11:03:17 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int    u[QR6_K];
 								    float d8[QR6_K];
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 11:03:17 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#pragma unroll
 								    for (int i = 0; i < QR6_K; ++i) {
 								        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
 								        d8[i] = __low2float(bq8_1[bq8_offset + 2*i].ds);
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 11:03:17 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
-												llama : implement YaRN RoPE scaling (#2268)

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Jeffrey Quesnelle <jquesnelle@gmail.com>

											
										
										
											2023-11-01 22:04:33 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
 								    GGML_UNUSED(x_qh);
-												llama : implement YaRN RoPE scaling (#2268)

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Jeffrey Quesnelle <jquesnelle@gmail.com>

											
										
										
											2023-11-01 22:04:33 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    __shared__ int   tile_x_ql[mmq_y * (2*WARP_SIZE)     + mmq_y];
 								    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
 								    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/8)     + mmq_y/8];
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 16:04:36 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    *x_ql = tile_x_ql;
 								    *x_dm = tile_x_dm;
 								    *x_sc = tile_x_sc;
-												llama : implement YaRN RoPE scaling (#2268)

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Jeffrey Quesnelle <jquesnelle@gmail.com>

											
										
										
											2023-11-01 22:04:33 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
 								    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
 								    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 								    GGML_UNUSED(x_qh);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_CUDA_ASSUME(i_offset >= 0);
 								    GGML_CUDA_ASSUME(i_offset <  nwarps);
 								    GGML_CUDA_ASSUME(k >= 0);
 								    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-												llama : implement YaRN RoPE scaling (#2268)

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Jeffrey Quesnelle <jquesnelle@gmail.com>

											
										
										
											2023-11-01 22:04:33 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int kbx  = k / QI6_K; // == 0 if QK_K == 256
 								    const int kqsx = k % QI6_K; // == k if QK_K == 256
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const block_q6_K * bx0 = (const block_q6_K *) vx;
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#pragma unroll
 								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
 								        int i = i0 + i_offset;
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												cuda : add RoPE kernel for mode == 2 (NeoX) (#2760)

* cuda : add RoPE kernel for mode == 2 (NeoX)

* falcon : do not offload the embeddings layer
											
										
										
											2023-08-25 08:55:59 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
 								        const int ky = QR6_K*kqsx;
-												cuda : add RoPE kernel for mode == 2 (NeoX) (#2760)

* cuda : add RoPE kernel for mode == 2 (NeoX)

* falcon : do not offload the embeddings layer
											
										
										
											2023-08-25 08:55:59 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int ql = get_int_from_uint8(bxi->ql, kqsx);
 								        const int ql0 = (ql >> 0) & 0x0F0F0F0F;
 								        const int ql1 = (ql >> 4) & 0x0F0F0F0F;
-												ggml-cuda : support stablelm rope (#4156)

* ggml-cuda : support stablelm rope

* remove unused freq_base kernel parameter

* add n_dims parameter to llm_build_k_shift, default to n_rot via overload

* llama : fix llm_build_k_shift args

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-11-24 17:04:31 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4));
 								        const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030;
 								        const int qh1 =  (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4))))       & 0x30303030;
-												llama : add phi-2 + fix NeoX rope + ggml_mul_mat_set_prec (#4490)

* phi2 implementation

* fix breaking change

* phi-2 : various fixes

* phi-2 : use layer norm eps

* py : whitespaces

* llama : fix meta KV override bug

* convert : phi don't add BOS token

* convert : revert "added_tokens_decoder" change

* phi-2 : scale Q instead of KQ for better precision

* ggml : fix NeoX rope to rotate just first n_dims

* cuda : less diff in the rope_neox kernel

* ggml : add ggml_mul_mat_set_prec

ggml-ci

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* cuda : ggml_cuda_op_mul_mat_cublas support F32 precision

* cuda : remove oboslete comment

---------

Co-authored-by: Ebey Abraham <ebeyabraham@microsoft.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-18 17:27:47 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0;
 								        const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2);
-												llama : add phi-2 + fix NeoX rope + ggml_mul_mat_set_prec (#4490)

* phi2 implementation

* fix breaking change

* phi-2 : various fixes

* phi-2 : use layer norm eps

* py : whitespaces

* llama : fix meta KV override bug

* convert : phi don't add BOS token

* convert : revert "added_tokens_decoder" change

* phi-2 : scale Q instead of KQ for better precision

* ggml : fix NeoX rope to rotate just first n_dims

* cuda : less diff in the rope_neox kernel

* ggml : add ggml_mul_mat_set_prec

ggml-ci

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* cuda : ggml_cuda_op_mul_mat_cublas support F32 precision

* cuda : remove oboslete comment

---------

Co-authored-by: Ebey Abraham <ebeyabraham@microsoft.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-18 17:27:47 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020);
 								        x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020);
-												llama : add phi-2 + fix NeoX rope + ggml_mul_mat_set_prec (#4490)

* phi2 implementation

* fix breaking change

* phi-2 : various fixes

* phi-2 : use layer norm eps

* py : whitespaces

* llama : fix meta KV override bug

* convert : phi don't add BOS token

* convert : revert "added_tokens_decoder" change

* phi-2 : scale Q instead of KQ for better precision

* ggml : fix NeoX rope to rotate just first n_dims

* cuda : less diff in the rope_neox kernel

* ggml : add ggml_mul_mat_set_prec

ggml-ci

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* cuda : ggml_cuda_op_mul_mat_cublas support F32 precision

* cuda : remove oboslete comment

---------

Co-authored-by: Ebey Abraham <ebeyabraham@microsoft.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-18 17:27:47 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
 								    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
 								    float * x_dmf = (float *) x_dm;
-												cuda : add RoPE kernel for mode == 2 (NeoX) (#2760)

* cuda : add RoPE kernel for mode == 2 (NeoX)

* falcon : do not offload the embeddings layer
											
										
										
											2023-08-25 08:55:59 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#pragma unroll
 								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
 								        int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;
-												cuda : add RoPE kernel for mode == 2 (NeoX) (#2760)

* cuda : add RoPE kernel for mode == 2 (NeoX)

* falcon : do not offload the embeddings layer
											
										
										
											2023-08-25 08:55:59 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												llm : add Falcon support (#2717)

* llama : refactor GGUF constants into static maps

* llama : check if model architecture is known

* llama : refactor llama_model_load_internal()

* gguf : add KV constant maps

* llm : read arch-specific KVs

* convert : add dummy scores + types

* falcon : load tensor data (CPU only)

* llama : fix loading progress bar

* llama : add arch member to llama_model

* falcon : CPU inference working

* falcon : support non-40B models

* falcon : minor

* llama : minor updates

ggml-ci

* convert-falcon-hf-to-gguf.py : fix special token mapping

* llama.cpp : llama default UNK token = id 0

* llama.cpp : fix bpe tokenizer

* llama.cpp : fix the fix of bpe tokenizer

* ggml : pass eps to ggml_norm

* metal : implement RoPE (mode = 2) + avoid ggml_repeat

* ggml : ggml_repeat always creates new tensor

* falcon : copy-paste self-attention from LLaMA

* metal : print extra compute pipeline info

* falcon : minor changes (still chasing the Metal problem)

* llama.cpp : fix linefeed token

* metal : fix GELU kernel numerical stability by using precise::tanh

* metal : temporary workaround for the concurrency optimization bug

* falcon : add CUDA offloading (#2739)

* llama : better model naming and size reporting

* llama : prep new tokenizer support

* llama : advanced BPE tokenizer based on ggllm.cpp imlpementation

* llama : remove oboslete comment

ggml-ci

* common : remove obsolete BPE API + disable test-tokenizer-1

* llama : revert BPE special-case in llama_byte_to_token()

* cuda : add TODOs for RoPE NeoX implementation

* llama : default special tokens based on vocab type

* perplexity : add log for start of tokenization

---------

Co-authored-by: klosax <131523366+klosax@users.noreply.github.com>
Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-08-23 20:08:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
-												ggml : sync (ggml_conv_2d, fix mul_mat bug, CUDA GLM rope)

											
										
										
											2023-07-14 13:36:41 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d;
-												ggml : sync (ggml_conv_2d, fix mul_mat bug, CUDA GLM rope)

											
										
										
											2023-07-14 13:36:41 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#pragma unroll
 								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
 								        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
-												ggml : sync latest (SAM + SD operators, CUDA alibi) (#2709)

* ggml : sync latest (SAM + SD operators, CUDA alibi)

ggml-ci

* ggml : fix tabs
											
										
										
											2023-08-22 11:22:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												ggml : sync latest (SAM + SD operators, CUDA alibi) (#2709)

* ggml : sync latest (SAM + SD operators, CUDA alibi)

ggml-ci

* ggml : fix tabs
											
										
										
											2023-08-22 11:22:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4;
-												ggml : sync latest (SAM + SD operators, CUDA alibi) (#2709)

* ggml : sync latest (SAM + SD operators, CUDA alibi)

ggml-ci

* ggml : fix tabs
											
										
										
											2023-08-22 11:22:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8));
-												ggml : sync latest (SAM + SD operators, CUDA alibi) (#2709)

* ggml : sync latest (SAM + SD operators, CUDA alibi)

ggml-ci

* ggml : fix tabs
											
										
										
											2023-08-22 11:22:08 +00:00
+								    }
 								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
 								    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
 								    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
 								    GGML_UNUSED(x_qh);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const float * x_dmf = (const float *) x_dm;
 								    const float * y_df  = (const float *) y_ds;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int index_x = i * (QR6_K*WARP_SIZE + 1) +  QR6_K*k;
 								    const int index_y = j * WARP_SIZE             + (QR6_K*k) % WARP_SIZE;
 								    return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
 								    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 								#if QK_K == 256
 								    const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
 								#if QR2_XXS == 8
 								    const int ib32 = iqs;
 								    const uint16_t * q2 = bq2->qs + 4*ib32;
 								    const uint8_t  * aux8 = (const uint8_t *)q2;
 								    const int8_t   * q8 = bq8_1[ib32].qs;
 								    uint32_t aux32 = q2[2] | (q2[3] << 16);
 								    int sumi = 0;
 								    for (int l = 0; l < 4; ++l) {
 								        const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
 								        const uint8_t  signs = ksigns_iq2xs[aux32 & 127];
 								        for (int j = 0; j < 8; ++j) {
 								            sumi += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
 								        }
 								        q8 += 8;
 								        aux32 >>= 7;
 								    }
 								    const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.25f;
 								    return d * sumi;
 								#else
 								    // iqs is 0...15
 								    const int ib32 = iqs/2;
 								    const int il = iqs%2;
 								    const uint16_t * q2 = bq2->qs + 4*ib32;
 								    const uint8_t  * aux8 = (const uint8_t *)q2;
 								    const uint8_t  * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
 								    const uint8_t  * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
 								    const uint32_t aux32 = q2[2] | (q2[3] << 16);
 								    const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * __low2float(bq8_1[ib32].ds) * 0.25f;
 								    const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127];
 								    const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127];
 								    const int8_t * q8 = bq8_1[ib32].qs + 16*il;
 								    int sumi1 = 0, sumi2 = 0;
 								    for (int j = 0; j < 8; ++j) {
 								        sumi1 += q8[j+0] * grid1[j] * (signs1 & kmask_iq2xs[j] ? -1 : 1);
 								        sumi2 += q8[j+8] * grid2[j] * (signs2 & kmask_iq2xs[j] ? -1 : 1);
 								    }
 								    return d * (sumi1 + sumi2);
 								#endif
 								#else
 								    assert(false);
 								    return 0.f;
 								#endif
 								}
 								static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
 								    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								#if QK_K == 256
 								    const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;
 								    const int ib32 = iqs;
 								    const uint16_t * q2 = bq2->qs + 4*ib32;
 								    const int8_t   * q8 = bq8_1[ib32].qs;
 								    const uint8_t ls1 = bq2->scales[ib32] & 0xf;
 								    const uint8_t ls2 = bq2->scales[ib32] >>  4;
 								    int sumi1 = 0;
 								    for (int l = 0; l < 2; ++l) {
 								        const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511));
 								        const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
 								        const int grid_l = __vsub4(grid[0] ^ signs[0], signs[0]);
 								        const int grid_h = __vsub4(grid[1] ^ signs[1], signs[1]);
 								        sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1);
 								        sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1);
 								        q8 += 8;
 								    }
 								    int sumi2 = 0;
 								    for (int l = 2; l < 4; ++l) {
 								        const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511));
 								        const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
 								        const int grid_l = __vsub4(grid[0] ^ signs[0], signs[0]);
 								        const int grid_h = __vsub4(grid[1] ^ signs[1], signs[1]);
 								        sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2);
 								        sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2);
 								        q8 += 8;
 								    }
 								    const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
 								    return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
 								#else
 								    GGML_UNUSED(ksigns64);
 								    assert(false);
 								    return 0.f;
 								#endif
 								#else
 								    GGML_UNUSED(ksigns64);
 								    assert(false);
 								    return 0.f;
 								#endif
 								}
 								// TODO
 								static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
 								    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								#if QK_K == 256
 								    const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
 								    const int ib32 = iqs;
 								    const int8_t  * q8 = bq8_1[ib32].qs;
 								    const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32;
 								    const uint8_t ls1 = bq2->scales[ib32] & 0xf;
 								    const uint8_t ls2 = bq2->scales[ib32] >>  4;
 								    int sumi1 = 0;
 								    for (int l = 0; l < 2; ++l) {
 								        const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
 								        const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
 								        const uint32_t signs1 = __vcmpeq4(((signs[l] >>  4) * 0x01010101) & 0x08040201, 0x08040201);
 								        const int grid_l = __vsub4(grid[0] ^ signs0, signs0);
 								        const int grid_h = __vsub4(grid[1] ^ signs1, signs1);
 								        sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1);
 								        sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1);
 								        q8 += 8;
 								    }
 								    int sumi2 = 0;
 								    for (int l = 2; l < 4; ++l) {
 								        const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
 								        const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
 								        const uint32_t signs1 = __vcmpeq4(((signs[l] >>  4) * 0x01010101) & 0x08040201, 0x08040201);
 								        const int grid_l = __vsub4(grid[0] ^ signs0, signs0);
 								        const int grid_h = __vsub4(grid[1] ^ signs1, signs1);
 								        sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2);
 								        sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2);
 								        q8 += 8;
 								    }
 								    const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
 								    return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
 								#else
 								    GGML_UNUSED(ksigns64);
 								    assert(false);
 								    return 0.f;
 								#endif
 								#else
 								    GGML_UNUSED(ksigns64);
 								    assert(false);
 								    return 0.f;
 								#endif
 								}
 								static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
 								    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								#if QK_K == 256
 								    const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;
 								    const int ib32 = iqs;
 								    const uint8_t  * q3 = bq2->qs + 8*ib32;
 								    const uint16_t * gas = (const uint16_t *)(bq2->qs + QK_K/4) + 2*ib32;
 								    const int8_t   * q8 = bq8_1[ib32].qs;
 								    uint32_t aux32 = gas[0] | (gas[1] << 16);
 								    int sumi = 0;
 								    for (int l = 0; l < 4; ++l) {
 								        const uint32_t * grid1 = iq3xxs_grid + q3[2*l+0];
 								        const uint32_t * grid2 = iq3xxs_grid + q3[2*l+1];
 								        const uint32_t * signs = (const uint32_t *)(ksigns64 + (aux32 & 127));
 								        const int grid_l = __vsub4(grid1[0] ^ signs[0], signs[0]);
 								        const int grid_h = __vsub4(grid2[0] ^ signs[1], signs[1]);
 								        sumi = __dp4a(grid_l, *((int *)q8+0), sumi);
 								        sumi = __dp4a(grid_h, *((int *)q8+1), sumi);
 								        q8 += 8;
 								        aux32 >>= 7;
 								    }
 								    const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.5f;
 								    return d * sumi;
 								#else
 								    assert(false);
 								    return 0.f;
 								#endif
 								#else
 								    assert(false);
 								    return 0.f;
 								#endif
 								}
 								// TODO: don't use lookup table for signs
 								static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
 								    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								#if QK_K == 256
 								    const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
 								    const int ib32 = iqs;
 								    const uint8_t  * qs = bq2->qs + 8*ib32;
 								    const int8_t   * q8 = bq8_1[ib32].qs;
 								    int sumi = 0;
 								    for (int l = 0; l < 4; ++l) {
 								        const uint32_t * grid1 = iq3s_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256));
 								        const uint32_t * grid2 = iq3s_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256));
 								        uint32_t signs0 = __vcmpeq4(((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
 								        uint32_t signs1 = __vcmpeq4(((bq2->signs[4*ib32+l] >>  4) * 0x01010101) & 0x08040201, 0x08040201);
 								        const int grid_l = __vsub4(grid1[0] ^ signs0, signs0);
 								        const int grid_h = __vsub4(grid2[0] ^ signs1, signs1);
 								        sumi = __dp4a(grid_l, *((int *)q8+0), sumi);
 								        sumi = __dp4a(grid_h, *((int *)q8+1), sumi);
 								        q8 += 8;
 								    }
 								    const float d = (float)bq2->d * (1 + 2*((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * __low2float(bq8_1[ib32].ds);
 								    return d * sumi;
 								#else
 								    assert(false);
 								    return 0.f;
 								#endif
 								#else
 								    assert(false);
 								    return 0.f;
 								#endif
 								}
 								static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
 								    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 								#if QK_K == 256
 								    const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
 								    const int ib32 = iqs;
 								    int sumi = 0;
 								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    const int * q8 = (const int *)bq8_1[ib32].qs;
 								    for (int l = 0; l < 4; ++l) {
 								        const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
 								        int grid0 = grid[0] & 0x0f0f0f0f;
 								        int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
 								        sumi = __dp4a(q8[2*l+1], grid1, __dp4a(q8[2*l+0], grid0, sumi));
 								    }
 								#else
 								    const int8_t * q8 = bq8_1[ib32].qs;
 								    for (int l = 0; l < 4; ++l) {
 								        const uint8_t * grid = (const uint8_t *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
 								        for (int j = 0; j < 4; ++j) {
 								            sumi += q8[j] * (grid[j] & 0xf) + q8[j+4] * (grid[j] >> 4);
 								        }
 								        q8 += 8;
 								    }
 								#endif
 								    const float delta = bq1->qh[ib32] & 0x8000 ? -1-IQ1S_DELTA : -1+IQ1S_DELTA;
 								    const float d1q = (float)bq1->d * (2*((bq1->qh[ib32] >> 12) & 7) + 1);
 								    const float d = d1q * __low2float (bq8_1[ib32].ds);
 								    const float m = d1q * __high2float(bq8_1[ib32].ds);
 								    return d * sumi + m * delta;
 								#else
 								    assert(false);
 								    return 0.f;
 								#endif
 								}
 								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								static __device__ __forceinline__ void get_int_from_table_16(const uint32_t & q4, const uint8_t * values,
 								        int & val1, int & val2) {
 								    uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32;
 								    aux32 = q4 & 0x0f0f0f0f;
 								    uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8);
 								    uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8);
 								    val1 = v1 | (v2 << 16);
 								    aux32 = (q4 >> 4) & 0x0f0f0f0f;
 								    v1 = values[q8[0]] | (values[q8[1]] << 8);
 								    v2 = values[q8[2]] | (values[q8[3]] << 8);
 								    val2 = v1 | (v2 << 16);
 								}
 								#endif
 								static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
 								    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 								    const block_iq4_nl * bq = (const block_iq4_nl *) vbq;
 								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs;
 								    const int32_t  * q8 = (const int32_t  *)bq8_1->qs + iqs;
 								    const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
 								    int v1, v2;
 								    int sumi1 = 0, sumi2 = 0;
 								    for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
 								        const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16);
 								        get_int_from_table_16(aux, values, v1, v2);
 								        sumi1 = __dp4a(v1, q8[l+0], sumi1);
 								        sumi2 = __dp4a(v2, q8[l+4], sumi2);
 								    }
 								#else
 								    const uint8_t * q4 = bq->qs + 4*iqs;
 								    const int8_t  * q8 = bq8_1->qs + 4*iqs;
 								    int sumi1 = 0, sumi2 = 0;
 								    for (int l = 0; l < 4*VDR_Q4_0_Q8_1_MMVQ; ++l) {
 								        sumi1 += q8[l+ 0] * kvalues_iq4nl[q4[l] & 0xf];
 								        sumi2 += q8[l+16] * kvalues_iq4nl[q4[l] >>  4];
 								    }
 								#endif
 								    const float d = (float)bq->d * __low2float(bq8_1->ds);
 								    return d * (sumi1 + sumi2);
 								}
 								static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
 								    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 								#if QK_K == 256
 								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
 								    const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
 								    //// iqs is 0...7
 								    //const int ib64 = iqs/2;
 								    //const int il = iqs%2;
 								    //const int32_t  * q8_1 = (const int *)bq8_1[2*ib64+0].qs + 2*il;
 								    //const int32_t  * q8_2 = (const int *)bq8_1[2*ib64+1].qs + 2*il;
 								    //const uint32_t * q4_1 = (const uint32_t *)bq4->qs + 8*ib64 + 2*il;
 								    //const uint32_t * q4_2 = q4_1 + 4;
 								    //const int8_t ls1 = (bq4->scales_l[ib64] & 0xf) | (((bq4->scales_h >> (4*ib64+0)) & 3) << 4);
 								    //const int8_t ls2 = (bq4->scales_l[ib64] >>  4) | (((bq4->scales_h >> (4*ib64+2)) & 3) << 4);
 								    //const float d1 = (float)bq4->d * (ls1 - 32) * __low2float(bq8_1[2*ib64+0].ds);
 								    //const float d2 = (float)bq4->d * (ls2 - 32) * __low2float(bq8_1[2*ib64+1].ds);
 								    //int v1, v2;
 								    //int sumi1 = 0, sumi2 = 0;
 								    //for (int j = 0; j < 2; ++j) {
 								    //    get_int_from_table_16(q4_1[j], values, v1, v2);
 								    //    sumi1 = __dp4a(v2, q8_1[j+4], __dp4a(v1, q8_1[j+0], sumi1));
 								    //    get_int_from_table_16(q4_2[j], values, v1, v2);
 								    //    sumi2 = __dp4a(v2, q8_2[j+4], __dp4a(v1, q8_2[j+0], sumi2));
 								    //}
 								    //return d1 * sumi1 + d2 * sumi2;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // iqs is 0...7
 								    const int ib32 = iqs;
 								    const int32_t  * q8 = (const int *)bq8_1[ib32].qs;
 								    const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32;
 								    const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4);
 								    const float d = (float)bq4->d * (ls - 32) * __low2float(bq8_1[ib32].ds);
 								    int v1, v2;
 								    int sumi1 = 0, sumi2 = 0;
 								    for (int j = 0; j < 4; ++j) {
 								        get_int_from_table_16(q4[j], values, v1, v2);
 								        sumi1 = __dp4a(v1, q8[j+0], sumi1);
 								        sumi2 = __dp4a(v2, q8[j+4], sumi2);
 								    }
 								    return d * (sumi1 + sumi2);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    //// iqs is 0...15
 								    //const int ib32 = iqs/2;
 								    //const int il = iqs%2;
 								    //const int32_t  * q8 = (const int *)bq8_1[ib32].qs + 2*il;
 								    //const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32 + 2*il;
 								    //const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4);
 								    //const float d = (float)bq4->d * (ls - 32) * __low2float(bq8_1[ib32].ds);
 								    //int v1, v2;
 								    //int sumi1 = 0, sumi2 = 0;
 								    //for (int j = 0; j < 2; ++j) {
 								    //    get_int_from_table_16(q4[j], values, v1, v2);
 								    //    sumi1 = __dp4a(v1, q8[j+0], sumi1);
 								    //    sumi2 = __dp4a(v2, q8[j+4], sumi2);
 								    //}
 								    //return d * (sumi1 + sumi2);
 								#else
 								    assert(false);
 								    return 0.f;
 								#endif
 								#else
 								    return vec_dot_iq4_xs_q8_1(vbq, bq8_1, iqs);
 								#endif
 								}
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
 								              allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
 								static __device__ __forceinline__ void mul_mat_q(
 								    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 								    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const block_q_t  * x = (const block_q_t  *) vx;
 								    const block_q8_1 * y = (const block_q8_1 *) vy;
 								    const int blocks_per_row_x = ncols_x / qk;
 								    const int blocks_per_col_y = nrows_y / QK8_1;
 								    const int blocks_per_warp = WARP_SIZE / qi;
 								    const int & ncols_dst = ncols_y;
 								    const int row_dst_0 = blockIdx.x*mmq_y;
 								    const int & row_x_0 = row_dst_0;
 								    const int col_dst_0 = blockIdx.y*mmq_x;
 								    const int & col_y_0 = col_dst_0;
 								    int   * tile_x_ql = nullptr;
 								    half2 * tile_x_dm = nullptr;
 								    int   * tile_x_qh = nullptr;
 								    int   * tile_x_sc = nullptr;
 								    allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
 								    __shared__ int    tile_y_qs[mmq_x * WARP_SIZE];
 								    __shared__ half2  tile_y_ds[mmq_x * WARP_SIZE/QI8_1];
 								    float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};
 								    for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
 								        load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
 								                   threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x);
 								#pragma unroll
 								        for (int ir = 0; ir < qr; ++ir) {
 								            const int kqs = ir*WARP_SIZE + threadIdx.x;
 								            const int kbxd = kqs / QI8_1;
 								#pragma unroll
 								            for (int i = 0; i < mmq_x; i += nwarps) {
 								                const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
 								                const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];
 								                const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE;
 								                tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
 								            }
 								#pragma unroll
 								            for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
 								                const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x;
 								                const int kby = threadIdx.x % (WARP_SIZE/QI8_1);
 								                const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
 								                // if the sum is not needed it's faster to transform the scale to f32 ahead of time
 								                const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds;
 								                half2       * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby];
 								                if (need_sum) {
 								                    *dsi_dst = *dsi_src;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								                } else {
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								                    float * dfi_dst = (float *) dsi_dst;
 								                    *dfi_dst = __low2float(*dsi_src);
 								                }
 								            }
 								            __syncthreads();
 								// #pragma unroll // unrolling this loop causes too much register pressure
 								            for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) {
 								#pragma unroll
 								                for (int j = 0; j < mmq_x; j += nwarps) {
 								#pragma unroll
 								                    for (int i = 0; i < mmq_y; i += WARP_SIZE) {
 								                        sum[i/WARP_SIZE][j/nwarps] += vec_dot(
 								                            tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
 								                            threadIdx.x + i, threadIdx.y + j, k);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								                    }
 								                }
 								            }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								            __syncthreads();
 								        }
 								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
 								#pragma unroll
 								    for (int j = 0; j < mmq_x; j += nwarps) {
 								        const int col_dst = col_dst_0 + j + threadIdx.y;
 								        if (col_dst >= ncols_dst) {
 								            return;
 								        }
 								#pragma unroll
 								        for (int i = 0; i < mmq_y; i += WARP_SIZE) {
 								            const int row_dst = row_dst_0 + threadIdx.x + i;
 								            if (row_dst >= nrows_dst) {
 								                continue;
 								            }
 								            dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps];
 								        }
 								    }
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#define  MMQ_X_Q4_0_RDNA2  64
 								#define  MMQ_Y_Q4_0_RDNA2  128
 								#define NWARPS_Q4_0_RDNA2  8
 								#define  MMQ_X_Q4_0_RDNA1  64
 								#define  MMQ_Y_Q4_0_RDNA1  64
 								#define NWARPS_Q4_0_RDNA1  8
 								#if defined(CUDA_USE_TENSOR_CORES)
 								#define  MMQ_X_Q4_0_AMPERE 4
 								#define  MMQ_Y_Q4_0_AMPERE 32
 								#define NWARPS_Q4_0_AMPERE 4
 								#else
 								#define  MMQ_X_Q4_0_AMPERE 64
 								#define  MMQ_Y_Q4_0_AMPERE 128
 								#define NWARPS_Q4_0_AMPERE 4
 								#endif
 								#define  MMQ_X_Q4_0_PASCAL 64
 								#define  MMQ_Y_Q4_0_PASCAL 64
 								#define NWARPS_Q4_0_PASCAL 8
 								template <bool need_check> static __global__ void
 								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    __launch_bounds__(WARP_SIZE*NWARPS_Q4_0_RDNA2, 2)
 								#endif // defined(RDNA3) || defined(RDNA2)
 								#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								    mul_mat_q4_0(
 								    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 								    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
 								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    const int mmq_x  =  MMQ_X_Q4_0_RDNA2;
 								    const int mmq_y  =  MMQ_Y_Q4_0_RDNA2;
 								    const int nwarps = NWARPS_Q4_0_RDNA2;
 								#else
 								    const int mmq_x  =  MMQ_X_Q4_0_RDNA1;
 								    const int mmq_y  =  MMQ_Y_Q4_0_RDNA1;
 								    const int nwarps = NWARPS_Q4_0_RDNA1;
 								#endif // defined(RDNA3) || defined(RDNA2)
 								    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
 								        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								#elif __CUDA_ARCH__ >= CC_VOLTA
 								    const int mmq_x  =  MMQ_X_Q4_0_AMPERE;
 								    const int mmq_y  =  MMQ_Y_Q4_0_AMPERE;
 								    const int nwarps = NWARPS_Q4_0_AMPERE;
 								    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
 								        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								#elif __CUDA_ARCH__ >= MIN_CC_DP4A
 								    const int mmq_x  =  MMQ_X_Q4_0_PASCAL;
 								    const int mmq_y  =  MMQ_Y_Q4_0_PASCAL;
 								    const int nwarps = NWARPS_Q4_0_PASCAL;
 								    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
 								        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								#else
 								    GGML_UNUSED(vec_dot_q4_0_q8_1_mul_mat);
 								    NO_DEVICE_CODE;
 								#endif // __CUDA_ARCH__ >= CC_VOLTA
 								}
 								#define  MMQ_X_Q4_1_RDNA2  64
 								#define  MMQ_Y_Q4_1_RDNA2  128
 								#define NWARPS_Q4_1_RDNA2  8
 								#define  MMQ_X_Q4_1_RDNA1  64
 								#define  MMQ_Y_Q4_1_RDNA1  64
 								#define NWARPS_Q4_1_RDNA1  8
 								#if defined(CUDA_USE_TENSOR_CORES)
 								#define  MMQ_X_Q4_1_AMPERE 4
 								#define  MMQ_Y_Q4_1_AMPERE 32
 								#define NWARPS_Q4_1_AMPERE 4
 								#else
 								#define  MMQ_X_Q4_1_AMPERE 64
 								#define  MMQ_Y_Q4_1_AMPERE 128
 								#define NWARPS_Q4_1_AMPERE 4
 								#endif
 								#define  MMQ_X_Q4_1_PASCAL 64
 								#define  MMQ_Y_Q4_1_PASCAL 64
 								#define NWARPS_Q4_1_PASCAL 8
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <bool need_check> static __global__ void
 								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
 								#endif // defined(RDNA3) || defined(RDNA2)
 								#elif __CUDA_ARCH__ < CC_VOLTA
 								    __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
 								#endif // __CUDA_ARCH__ < CC_VOLTA
 								    mul_mat_q4_1(
 								    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 								    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    const int mmq_x  =  MMQ_X_Q4_1_RDNA2;
 								    const int mmq_y  =  MMQ_Y_Q4_1_RDNA2;
 								    const int nwarps = NWARPS_Q4_1_RDNA2;
 								#else
 								    const int mmq_x  =  MMQ_X_Q4_1_RDNA1;
 								    const int mmq_y  =  MMQ_Y_Q4_1_RDNA1;
 								    const int nwarps = NWARPS_Q4_1_RDNA1;
 								#endif // defined(RDNA3) || defined(RDNA2)
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
 								        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 07:58:55 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#elif __CUDA_ARCH__ >= CC_VOLTA
 								    const int mmq_x  =  MMQ_X_Q4_1_AMPERE;
 								    const int mmq_y  =  MMQ_Y_Q4_1_AMPERE;
 								    const int nwarps = NWARPS_Q4_1_AMPERE;
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 07:58:55 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
 								        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 07:58:55 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#elif __CUDA_ARCH__ >= MIN_CC_DP4A
 								    const int mmq_x  =  MMQ_X_Q4_1_PASCAL;
 								    const int mmq_y  =  MMQ_Y_Q4_1_PASCAL;
 								    const int nwarps = NWARPS_Q4_1_PASCAL;
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 07:58:55 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
 								        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								#else
 								    GGML_UNUSED(vec_dot_q4_1_q8_1_mul_mat);
 								    NO_DEVICE_CODE;
 								#endif // __CUDA_ARCH__ >= CC_VOLTA
 								}
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 07:58:55 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#define  MMQ_X_Q5_0_RDNA2  64
 								#define  MMQ_Y_Q5_0_RDNA2  128
 								#define NWARPS_Q5_0_RDNA2  8
 								#define  MMQ_X_Q5_0_RDNA1  64
 								#define  MMQ_Y_Q5_0_RDNA1  64
 								#define NWARPS_Q5_0_RDNA1  8
 								#if defined(CUDA_USE_TENSOR_CORES)
 								#define  MMQ_X_Q5_0_AMPERE 4
 								#define  MMQ_Y_Q5_0_AMPERE 32
 								#define NWARPS_Q5_0_AMPERE 4
 								#else
 								#define  MMQ_X_Q5_0_AMPERE 128
 								#define  MMQ_Y_Q5_0_AMPERE 64
 								#define NWARPS_Q5_0_AMPERE 4
 								#endif
 								#define  MMQ_X_Q5_0_PASCAL 64
 								#define  MMQ_Y_Q5_0_PASCAL 64
 								#define NWARPS_Q5_0_PASCAL 8
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 07:58:55 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <bool need_check> static __global__ void
 								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    __launch_bounds__(WARP_SIZE*NWARPS_Q5_0_RDNA2, 2)
 								#endif // defined(RDNA3) || defined(RDNA2)
 								#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								    mul_mat_q5_0(
 								    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 								    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 07:58:55 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    const int mmq_x  =  MMQ_X_Q5_0_RDNA2;
 								    const int mmq_y  =  MMQ_Y_Q5_0_RDNA2;
 								    const int nwarps = NWARPS_Q5_0_RDNA2;
 								#else
 								    const int mmq_x  =  MMQ_X_Q5_0_RDNA1;
 								    const int mmq_y  =  MMQ_Y_Q5_0_RDNA1;
 								    const int nwarps = NWARPS_Q5_0_RDNA1;
 								#endif // defined(RDNA3) || defined(RDNA2)
-												ggml : add ggml_soft_max_ext (#4256)

* metal : implement soft_max_ext

* cuda : implement soft_max_ext

* ggml : implement soft_max_ext (CPU)

* batched-bench : print threads

ggml-ci

* metal : simplify soft_max encoding

ggml-ci

* cuda : use 512 threads for soft_max instead of 32

* ggml : update soft max cpu

* cuda : do warp-based block reduce

* cuda : increase max block size to 1024

* cuda : fix warp reduction initialization of shared mem

* metal : warp-based reduction for soft max kernel

* metal : warp-based reduce for rms_norm

* metal : simplify soft max kernel

ggml-ci

* alloc : fix build with debug
											
										
										
											2023-12-01 08:51:24 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
 								        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#elif __CUDA_ARCH__ >= CC_VOLTA
 								    const int mmq_x  =  MMQ_X_Q5_0_AMPERE;
 								    const int mmq_y  =  MMQ_Y_Q5_0_AMPERE;
 								    const int nwarps = NWARPS_Q5_0_AMPERE;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
 								        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 07:58:55 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#elif __CUDA_ARCH__ >= MIN_CC_DP4A
 								    const int mmq_x  =  MMQ_X_Q5_0_PASCAL;
 								    const int mmq_y  =  MMQ_Y_Q5_0_PASCAL;
 								    const int nwarps = NWARPS_Q5_0_PASCAL;
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 07:58:55 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
 								        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								#else
 								    GGML_UNUSED(vec_dot_q5_0_q8_1_mul_mat);
 								    NO_DEVICE_CODE;
 								#endif // __CUDA_ARCH__ >= CC_VOLTA
 								}
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 07:58:55 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#define  MMQ_X_Q5_1_RDNA2  64
 								#define  MMQ_Y_Q5_1_RDNA2  128
 								#define NWARPS_Q5_1_RDNA2  8
 								#define  MMQ_X_Q5_1_RDNA1  64
 								#define  MMQ_Y_Q5_1_RDNA1  64
 								#define NWARPS_Q5_1_RDNA1  8
 								#if defined(CUDA_USE_TENSOR_CORES)
 								#define  MMQ_X_Q5_1_AMPERE 4
 								#define  MMQ_Y_Q5_1_AMPERE 32
 								#define NWARPS_Q5_1_AMPERE 4
 								#else
 								#define  MMQ_X_Q5_1_AMPERE 128
 								#define  MMQ_Y_Q5_1_AMPERE 64
 								#define NWARPS_Q5_1_AMPERE 4
 								#endif
 								#define  MMQ_X_Q5_1_PASCAL 64
 								#define  MMQ_Y_Q5_1_PASCAL 64
 								#define NWARPS_Q5_1_PASCAL 8
-												ggml : add ALiBi support for ggml_soft_max_ext (#5488)

* ggml : avoid recomputing alibi slopes (CPU)

* llama : reuse hparams.f_max_alibi_bias in all cases

ggml-ci

* ggml : support alibi bias in ggml_soft_max_ext (CPU + Metal)

ggml-ci

* ggml : handle all SRCs (do not break on first null)

ggml-ci

* tests : do not use slope for large soft_max

accumulates too much error

ggml-ci

* ggml : alternative ALiBi without extra tensor

We compute the slopes in the kernel

ggml-ci

* cuda : add ALiBi support in ggml_soft_max_ext

ggml-ci

* ggml : deprecate ggml_alibi

* ggml : support multi-sequence ALiBi (Metal)

ggml-ci

* cuda : add multi-seq ALiBi + remote F16 soft_max

ggml-ci

* ggml : update deprecation message

* ggml : fix pos ptr when no ALiBi

ggml-ci

* cuda : fix performance (pow -> powf)

* cuda : precompute ALiBi constants

* metal : pre-compute ALiBi slopes

ggml-ci

* llama : init kq_pos only if needed

ggml-ci

* test-backend-ops : add null pos test to soft_max

test-backend-ops : replace soft_max tests

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-17 21:04:16 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <bool need_check> static __global__ void
 								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    __launch_bounds__(WARP_SIZE*NWARPS_Q5_1_RDNA2, 2)
 								#endif // defined(RDNA3) || defined(RDNA2)
 								#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								mul_mat_q5_1(
 								    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 								    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    const int mmq_x  =  MMQ_X_Q5_1_RDNA2;
 								    const int mmq_y  =  MMQ_Y_Q5_1_RDNA2;
 								    const int nwarps = NWARPS_Q5_1_RDNA2;
 								#else
 								    const int mmq_x  =  MMQ_X_Q5_1_RDNA1;
 								    const int mmq_y  =  MMQ_Y_Q5_1_RDNA1;
 								    const int nwarps = NWARPS_Q5_1_RDNA1;
 								#endif // defined(RDNA3) || defined(RDNA2)
-												ggml : add ggml_soft_max_ext (#4256)

* metal : implement soft_max_ext

* cuda : implement soft_max_ext

* ggml : implement soft_max_ext (CPU)

* batched-bench : print threads

ggml-ci

* metal : simplify soft_max encoding

ggml-ci

* cuda : use 512 threads for soft_max instead of 32

* ggml : update soft max cpu

* cuda : do warp-based block reduce

* cuda : increase max block size to 1024

* cuda : fix warp reduction initialization of shared mem

* metal : warp-based reduction for soft max kernel

* metal : warp-based reduce for rms_norm

* metal : simplify soft max kernel

ggml-ci

* alloc : fix build with debug
											
										
										
											2023-12-01 08:51:24 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
 								        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												ggml : add ggml_soft_max_ext (#4256)

* metal : implement soft_max_ext

* cuda : implement soft_max_ext

* ggml : implement soft_max_ext (CPU)

* batched-bench : print threads

ggml-ci

* metal : simplify soft_max encoding

ggml-ci

* cuda : use 512 threads for soft_max instead of 32

* ggml : update soft max cpu

* cuda : do warp-based block reduce

* cuda : increase max block size to 1024

* cuda : fix warp reduction initialization of shared mem

* metal : warp-based reduction for soft max kernel

* metal : warp-based reduce for rms_norm

* metal : simplify soft max kernel

ggml-ci

* alloc : fix build with debug
											
										
										
											2023-12-01 08:51:24 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#elif __CUDA_ARCH__ >= CC_VOLTA
 								    const int mmq_x  =  MMQ_X_Q5_1_AMPERE;
 								    const int mmq_y  =  MMQ_Y_Q5_1_AMPERE;
 								    const int nwarps = NWARPS_Q5_1_AMPERE;
-												Fix CUDA softmax by subtracting max value before exp (#2665)


											
										
										
											2023-08-22 18:27:06 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
 								        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#elif __CUDA_ARCH__ >= MIN_CC_DP4A
 								    const int mmq_x  =  MMQ_X_Q5_1_PASCAL;
 								    const int mmq_y  =  MMQ_Y_Q5_1_PASCAL;
 								    const int nwarps = NWARPS_Q5_1_PASCAL;
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 07:58:55 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
 								        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								#else
 								    GGML_UNUSED(vec_dot_q5_1_q8_1_mul_mat);
 								    NO_DEVICE_CODE;
 								#endif // __CUDA_ARCH__ >= CC_VOLTA
 								}
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 07:58:55 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#define  MMQ_X_Q8_0_RDNA2  64
 								#define  MMQ_Y_Q8_0_RDNA2  128
 								#define NWARPS_Q8_0_RDNA2  8
 								#define  MMQ_X_Q8_0_RDNA1  64
 								#define  MMQ_Y_Q8_0_RDNA1  64
 								#define NWARPS_Q8_0_RDNA1  8
 								#if defined(CUDA_USE_TENSOR_CORES)
 								#define  MMQ_X_Q8_0_AMPERE 4
 								#define  MMQ_Y_Q8_0_AMPERE 32
 								#define NWARPS_Q8_0_AMPERE 4
 								#else
 								#define  MMQ_X_Q8_0_AMPERE 128
 								#define  MMQ_Y_Q8_0_AMPERE 64
 								#define NWARPS_Q8_0_AMPERE 4
 								#endif
 								#define  MMQ_X_Q8_0_PASCAL 64
 								#define  MMQ_Y_Q8_0_PASCAL 64
 								#define NWARPS_Q8_0_PASCAL 8
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <bool need_check> static __global__ void
 								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    __launch_bounds__(WARP_SIZE*NWARPS_Q8_0_RDNA2, 2)
 								#endif // defined(RDNA3) || defined(RDNA2)
 								#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								    mul_mat_q8_0(
 								    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 								    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-												ggml : add ggml_soft_max_ext (#4256)

* metal : implement soft_max_ext

* cuda : implement soft_max_ext

* ggml : implement soft_max_ext (CPU)

* batched-bench : print threads

ggml-ci

* metal : simplify soft_max encoding

ggml-ci

* cuda : use 512 threads for soft_max instead of 32

* ggml : update soft max cpu

* cuda : do warp-based block reduce

* cuda : increase max block size to 1024

* cuda : fix warp reduction initialization of shared mem

* metal : warp-based reduction for soft max kernel

* metal : warp-based reduce for rms_norm

* metal : simplify soft max kernel

ggml-ci

* alloc : fix build with debug
											
										
										
											2023-12-01 08:51:24 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    const int mmq_x  =  MMQ_X_Q8_0_RDNA2;
 								    const int mmq_y  =  MMQ_Y_Q8_0_RDNA2;
 								    const int nwarps = NWARPS_Q8_0_RDNA2;
 								#else
 								    const int mmq_x  =  MMQ_X_Q8_0_RDNA1;
 								    const int mmq_y  =  MMQ_Y_Q8_0_RDNA1;
 								    const int nwarps = NWARPS_Q8_0_RDNA1;
 								#endif // defined(RDNA3) || defined(RDNA2)
-												ggml : add ggml_soft_max_ext (#4256)

* metal : implement soft_max_ext

* cuda : implement soft_max_ext

* ggml : implement soft_max_ext (CPU)

* batched-bench : print threads

ggml-ci

* metal : simplify soft_max encoding

ggml-ci

* cuda : use 512 threads for soft_max instead of 32

* ggml : update soft max cpu

* cuda : do warp-based block reduce

* cuda : increase max block size to 1024

* cuda : fix warp reduction initialization of shared mem

* metal : warp-based reduction for soft max kernel

* metal : warp-based reduce for rms_norm

* metal : simplify soft max kernel

ggml-ci

* alloc : fix build with debug
											
										
										
											2023-12-01 08:51:24 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
 								        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#elif __CUDA_ARCH__ >= CC_VOLTA
 								    const int mmq_x  =  MMQ_X_Q8_0_AMPERE;
 								    const int mmq_y  =  MMQ_Y_Q8_0_AMPERE;
 								    const int nwarps = NWARPS_Q8_0_AMPERE;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
 								        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 07:58:55 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#elif __CUDA_ARCH__ >= MIN_CC_DP4A
 								    const int mmq_x  =  MMQ_X_Q8_0_PASCAL;
 								    const int mmq_y  =  MMQ_Y_Q8_0_PASCAL;
 								    const int nwarps = NWARPS_Q8_0_PASCAL;
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 07:58:55 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
 								        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								#else
 								    GGML_UNUSED(vec_dot_q8_0_q8_1_mul_mat);
 								    NO_DEVICE_CODE;
 								#endif // __CUDA_ARCH__ >= CC_VOLTA
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#define  MMQ_X_Q2_K_RDNA2  64
 								#define  MMQ_Y_Q2_K_RDNA2  128
 								#define NWARPS_Q2_K_RDNA2  8
 								#define  MMQ_X_Q2_K_RDNA1  128
 								#define  MMQ_Y_Q2_K_RDNA1  32
 								#define NWARPS_Q2_K_RDNA1  8
 								#if defined(CUDA_USE_TENSOR_CORES)
 								#define  MMQ_X_Q2_K_AMPERE 4
 								#define  MMQ_Y_Q2_K_AMPERE 32
 								#define NWARPS_Q2_K_AMPERE 4
 								#else
 								#define  MMQ_X_Q2_K_AMPERE 64
 								#define  MMQ_Y_Q2_K_AMPERE 128
 								#define NWARPS_Q2_K_AMPERE 4
 								#endif
 								#define  MMQ_X_Q2_K_PASCAL 64
 								#define  MMQ_Y_Q2_K_PASCAL 64
 								#define NWARPS_Q2_K_PASCAL 8
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <bool need_check> static __global__ void
 								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    __launch_bounds__(WARP_SIZE*NWARPS_Q2_K_RDNA2, 2)
 								#endif // defined(RDNA3) || defined(RDNA2)
 								#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								mul_mat_q2_K(
 								    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 								    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    const int mmq_x  =  MMQ_X_Q2_K_RDNA2;
 								    const int mmq_y  =  MMQ_Y_Q2_K_RDNA2;
 								    const int nwarps = NWARPS_Q2_K_RDNA2;
 								#else
 								    const int mmq_x  =  MMQ_X_Q2_K_RDNA1;
 								    const int mmq_y  =  MMQ_Y_Q2_K_RDNA1;
 								    const int nwarps = NWARPS_Q2_K_RDNA1;
 								#endif // defined(RDNA3) || defined(RDNA2)
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
 								        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												llm : add MPT support (#3417)

* CUDA: added support for ggml_clamp (see also: https://github.com/ggerganov/ggml/issues/545)

* mpt : added an implementation based (mostly) on falcon integration, modified with deltas from ggml/examples/mpt

* mpt : protect against "clip_qkv": null in mpt-7b

* mpt : quick fix to avoid "Strange model" warning when quantizing MPT models

* mpt : addendum to changeset:84e30e8 - leave parameter clamp_kqv out from metadata rather than use 0.0 to indicate "no clamping" (more compliant with the current GGUF spec?)

* mpt : standardized all tensor names to follow GGUF spec

* mpt : addendum to changeset:1be89c40 - use "req" parameter of GGUF_GET_KEY macro instead of duplicate code

* mpt : fixed comment s/gptneox/mpt/

* mpt : remove tabs, trailing whitespace

* mpt : removed ne01 + n_past == ne00 assertion from alibi (cuda/f32) and rope_shift from build_mpt

* mpt : updated convert-mpt-hf-to-gguf.py to reflect changes made to convert-gptneox-hf-to-gguf.py in pr:3252

* comment out n_past instead of marking it unused

* mpt : removed hardcoded +178 from convert script in favor of utilizing hparams["vocab_size"]

* mpt : remove unused tokenizer_json in convert script

* ggml : remove obsolete n_past assert in ggml_alibi

* llama : print clam_kqv and max_alibi_bias hparams

---------

Co-authored-by: Cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-10-10 07:50:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#elif __CUDA_ARCH__ >= CC_VOLTA
 								    const int mmq_x  =  MMQ_X_Q2_K_AMPERE;
 								    const int mmq_y  =  MMQ_Y_Q2_K_AMPERE;
 								    const int nwarps = NWARPS_Q2_K_AMPERE;
-												llm : add MPT support (#3417)

* CUDA: added support for ggml_clamp (see also: https://github.com/ggerganov/ggml/issues/545)

* mpt : added an implementation based (mostly) on falcon integration, modified with deltas from ggml/examples/mpt

* mpt : protect against "clip_qkv": null in mpt-7b

* mpt : quick fix to avoid "Strange model" warning when quantizing MPT models

* mpt : addendum to changeset:84e30e8 - leave parameter clamp_kqv out from metadata rather than use 0.0 to indicate "no clamping" (more compliant with the current GGUF spec?)

* mpt : standardized all tensor names to follow GGUF spec

* mpt : addendum to changeset:1be89c40 - use "req" parameter of GGUF_GET_KEY macro instead of duplicate code

* mpt : fixed comment s/gptneox/mpt/

* mpt : remove tabs, trailing whitespace

* mpt : removed ne01 + n_past == ne00 assertion from alibi (cuda/f32) and rope_shift from build_mpt

* mpt : updated convert-mpt-hf-to-gguf.py to reflect changes made to convert-gptneox-hf-to-gguf.py in pr:3252

* comment out n_past instead of marking it unused

* mpt : removed hardcoded +178 from convert script in favor of utilizing hparams["vocab_size"]

* mpt : remove unused tokenizer_json in convert script

* ggml : remove obsolete n_past assert in ggml_alibi

* llama : print clam_kqv and max_alibi_bias hparams

---------

Co-authored-by: Cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-10-10 07:50:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
 								        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								#elif __CUDA_ARCH__ >= MIN_CC_DP4A
 								    const int mmq_x  =  MMQ_X_Q2_K_PASCAL;
 								    const int mmq_y  =  MMQ_Y_Q2_K_PASCAL;
 								    const int nwarps = NWARPS_Q2_K_PASCAL;
 								    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
 								        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								#else
 								    GGML_UNUSED(vec_dot_q2_K_q8_1_mul_mat);
 								    NO_DEVICE_CODE;
 								#endif // __CUDA_ARCH__ >= CC_VOLTA
-												llm : add MPT support (#3417)

* CUDA: added support for ggml_clamp (see also: https://github.com/ggerganov/ggml/issues/545)

* mpt : added an implementation based (mostly) on falcon integration, modified with deltas from ggml/examples/mpt

* mpt : protect against "clip_qkv": null in mpt-7b

* mpt : quick fix to avoid "Strange model" warning when quantizing MPT models

* mpt : addendum to changeset:84e30e8 - leave parameter clamp_kqv out from metadata rather than use 0.0 to indicate "no clamping" (more compliant with the current GGUF spec?)

* mpt : standardized all tensor names to follow GGUF spec

* mpt : addendum to changeset:1be89c40 - use "req" parameter of GGUF_GET_KEY macro instead of duplicate code

* mpt : fixed comment s/gptneox/mpt/

* mpt : remove tabs, trailing whitespace

* mpt : removed ne01 + n_past == ne00 assertion from alibi (cuda/f32) and rope_shift from build_mpt

* mpt : updated convert-mpt-hf-to-gguf.py to reflect changes made to convert-gptneox-hf-to-gguf.py in pr:3252

* comment out n_past instead of marking it unused

* mpt : removed hardcoded +178 from convert script in favor of utilizing hparams["vocab_size"]

* mpt : remove unused tokenizer_json in convert script

* ggml : remove obsolete n_past assert in ggml_alibi

* llama : print clam_kqv and max_alibi_bias hparams

---------

Co-authored-by: Cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-10-10 07:50:23 +00:00
+								}
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#define  MMQ_X_Q3_K_RDNA2  128
 								#define  MMQ_Y_Q3_K_RDNA2  64
 								#define NWARPS_Q3_K_RDNA2  8
 								#define  MMQ_X_Q3_K_RDNA1  32
 								#define  MMQ_Y_Q3_K_RDNA1  128
 								#define NWARPS_Q3_K_RDNA1  8
 								#if defined(CUDA_USE_TENSOR_CORES)
 								#define  MMQ_X_Q3_K_AMPERE 4
 								#define  MMQ_Y_Q3_K_AMPERE 32
 								#define NWARPS_Q3_K_AMPERE 4
 								#else
 								#define  MMQ_X_Q3_K_AMPERE 128
 								#define  MMQ_Y_Q3_K_AMPERE 128
 								#define NWARPS_Q3_K_AMPERE 4
 								#endif
 								#define  MMQ_X_Q3_K_PASCAL 64
 								#define  MMQ_Y_Q3_K_PASCAL 64
 								#define NWARPS_Q3_K_PASCAL 8
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <bool need_check> static __global__ void
 								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
 								#endif // defined(RDNA3) || defined(RDNA2)
 								#elif __CUDA_ARCH__ < CC_VOLTA
 								    __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
 								#endif // __CUDA_ARCH__ < CC_VOLTA
 								    mul_mat_q3_K(
 								    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 								    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    const int mmq_x  =  MMQ_X_Q3_K_RDNA2;
 								    const int mmq_y  =  MMQ_Y_Q3_K_RDNA2;
 								    const int nwarps = NWARPS_Q3_K_RDNA2;
 								#else
 								    const int mmq_x  =  MMQ_X_Q3_K_RDNA1;
 								    const int mmq_y  =  MMQ_Y_Q3_K_RDNA1;
 								    const int nwarps = NWARPS_Q3_K_RDNA1;
 								#endif // defined(RDNA3) || defined(RDNA2)
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 13:10:15 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
 								        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												ggml : sync (im2col, GPU conv, 32-bit arm compat) (#4060)

ggml-ci
											
										
										
											2023-11-13 14:55:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#elif __CUDA_ARCH__ >= CC_VOLTA
 								    const int mmq_x  =  MMQ_X_Q3_K_AMPERE;
 								    const int mmq_y  =  MMQ_Y_Q3_K_AMPERE;
 								    const int nwarps = NWARPS_Q3_K_AMPERE;
-												ggml : sync (im2col, GPU conv, 32-bit arm compat) (#4060)

ggml-ci
											
										
										
											2023-11-13 14:55:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
 								        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												ggml : sync (im2col, GPU conv, 32-bit arm compat) (#4060)

ggml-ci
											
										
										
											2023-11-13 14:55:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#elif __CUDA_ARCH__ >= MIN_CC_DP4A
 								    const int mmq_x  =  MMQ_X_Q3_K_PASCAL;
 								    const int mmq_y  =  MMQ_Y_Q3_K_PASCAL;
 								    const int nwarps = NWARPS_Q3_K_PASCAL;
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 13:10:15 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
 								        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								#else
 								    GGML_UNUSED(vec_dot_q3_K_q8_1_mul_mat);
 								    NO_DEVICE_CODE;
 								#endif // __CUDA_ARCH__ >= CC_VOLTA
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 13:10:15 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#define  MMQ_X_Q4_K_RDNA2  64
 								#define  MMQ_Y_Q4_K_RDNA2  128
 								#define NWARPS_Q4_K_RDNA2  8
 								#define  MMQ_X_Q4_K_RDNA1  32
 								#define  MMQ_Y_Q4_K_RDNA1  64
 								#define NWARPS_Q4_K_RDNA1  8
 								#if defined(CUDA_USE_TENSOR_CORES)
 								#define  MMQ_X_Q4_K_AMPERE 4
 								#define  MMQ_Y_Q4_K_AMPERE 32
 								#define NWARPS_Q4_K_AMPERE 4
 								#else
 								#define  MMQ_X_Q4_K_AMPERE 64
 								#define  MMQ_Y_Q4_K_AMPERE 128
 								#define NWARPS_Q4_K_AMPERE 4
 								#endif
 								#define  MMQ_X_Q4_K_PASCAL 64
 								#define  MMQ_Y_Q4_K_PASCAL 64
 								#define NWARPS_Q4_K_PASCAL 8
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <bool need_check> static __global__ void
 								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
 								#endif // defined(RDNA3) || defined(RDNA2)
 								#elif __CUDA_ARCH__ < CC_VOLTA
 								    __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
 								#endif // __CUDA_ARCH__ < CC_VOLTA
 								    mul_mat_q4_K(
 								    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 								    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    const int mmq_x  =  MMQ_X_Q4_K_RDNA2;
 								    const int mmq_y  =  MMQ_Y_Q4_K_RDNA2;
 								    const int nwarps = NWARPS_Q4_K_RDNA2;
 								#else
 								    const int mmq_x  =  MMQ_X_Q4_K_RDNA1;
 								    const int mmq_y  =  MMQ_Y_Q4_K_RDNA1;
 								    const int nwarps = NWARPS_Q4_K_RDNA1;
 								#endif // defined(RDNA3) || defined(RDNA2)
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
 								        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#elif __CUDA_ARCH__ >= CC_VOLTA
 								    const int mmq_x  =  MMQ_X_Q4_K_AMPERE;
 								    const int mmq_y  =  MMQ_Y_Q4_K_AMPERE;
 								    const int nwarps = NWARPS_Q4_K_AMPERE;
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
 								        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#elif __CUDA_ARCH__ >= MIN_CC_DP4A
 								    const int mmq_x  =  MMQ_X_Q4_K_PASCAL;
 								    const int mmq_y  =  MMQ_Y_Q4_K_PASCAL;
 								    const int nwarps = NWARPS_Q4_K_PASCAL;
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
 								        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								#else
 								    GGML_UNUSED(vec_dot_q4_K_q8_1_mul_mat);
 								    NO_DEVICE_CODE;
 								#endif // __CUDA_ARCH__ >= CC_VOLTA
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#define  MMQ_X_Q5_K_RDNA2  64
 								#define  MMQ_Y_Q5_K_RDNA2  128
 								#define NWARPS_Q5_K_RDNA2  8
 								#define  MMQ_X_Q5_K_RDNA1  32
 								#define  MMQ_Y_Q5_K_RDNA1  64
 								#define NWARPS_Q5_K_RDNA1  8
 								#if defined(CUDA_USE_TENSOR_CORES)
 								#define  MMQ_X_Q5_K_AMPERE 4
 								#define  MMQ_Y_Q5_K_AMPERE 32
 								#define NWARPS_Q5_K_AMPERE 4
 								#else
 								#define  MMQ_X_Q5_K_AMPERE 64
 								#define  MMQ_Y_Q5_K_AMPERE 128
 								#define NWARPS_Q5_K_AMPERE 4
 								#endif
 								#define  MMQ_X_Q5_K_PASCAL 64
 								#define  MMQ_Y_Q5_K_PASCAL 64
 								#define NWARPS_Q5_K_PASCAL 8
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <bool need_check> static __global__ void
 								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    __launch_bounds__(WARP_SIZE*NWARPS_Q5_K_RDNA2, 2)
 								#endif // defined(RDNA3) || defined(RDNA2)
 								#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								mul_mat_q5_K(
 								    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 								    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    const int mmq_x  =  MMQ_X_Q5_K_RDNA2;
 								    const int mmq_y  =  MMQ_Y_Q5_K_RDNA2;
 								    const int nwarps = NWARPS_Q5_K_RDNA2;
 								#else
 								    const int mmq_x  =  MMQ_X_Q5_K_RDNA1;
 								    const int mmq_y  =  MMQ_Y_Q5_K_RDNA1;
 								    const int nwarps = NWARPS_Q5_K_RDNA1;
 								#endif // defined(RDNA3) || defined(RDNA2)
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
 								        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#elif __CUDA_ARCH__ >= CC_VOLTA
 								    const int mmq_x  =  MMQ_X_Q5_K_AMPERE;
 								    const int mmq_y  =  MMQ_Y_Q5_K_AMPERE;
 								    const int nwarps = NWARPS_Q5_K_AMPERE;
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
 								        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#elif __CUDA_ARCH__ >= MIN_CC_DP4A
 								    const int mmq_x  =  MMQ_X_Q5_K_PASCAL;
 								    const int mmq_y  =  MMQ_Y_Q5_K_PASCAL;
 								    const int nwarps = NWARPS_Q5_K_PASCAL;
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
 								        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								#else
 								    GGML_UNUSED(vec_dot_q5_K_q8_1_mul_mat);
 								    NO_DEVICE_CODE;
 								#endif // __CUDA_ARCH__ >= CC_VOLTA
 								}
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#define  MMQ_X_Q6_K_RDNA2  64
 								#define  MMQ_Y_Q6_K_RDNA2  128
 								#define NWARPS_Q6_K_RDNA2  8
 								#define  MMQ_X_Q6_K_RDNA1  32
 								#define  MMQ_Y_Q6_K_RDNA1  64
 								#define NWARPS_Q6_K_RDNA1  8
 								#if defined(CUDA_USE_TENSOR_CORES)
 								#define  MMQ_X_Q6_K_AMPERE 4
 								#define  MMQ_Y_Q6_K_AMPERE 32
 								#define NWARPS_Q6_K_AMPERE 4
 								#else
 								#define  MMQ_X_Q6_K_AMPERE 64
 								#define  MMQ_Y_Q6_K_AMPERE 64
 								#define NWARPS_Q6_K_AMPERE 4
 								#endif
 								#define  MMQ_X_Q6_K_PASCAL 64
 								#define  MMQ_Y_Q6_K_PASCAL 64
 								#define NWARPS_Q6_K_PASCAL 8
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <bool need_check> static __global__ void
 								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
 								#endif // defined(RDNA3) || defined(RDNA2)
 								#elif __CUDA_ARCH__ < CC_VOLTA
 								    __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
 								#endif // __CUDA_ARCH__ < CC_VOLTA
 								    mul_mat_q6_K(
 								    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 								    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    const int mmq_x  =  MMQ_X_Q6_K_RDNA2;
 								    const int mmq_y  =  MMQ_Y_Q6_K_RDNA2;
 								    const int nwarps = NWARPS_Q6_K_RDNA2;
 								#else
 								    const int mmq_x  =  MMQ_X_Q6_K_RDNA1;
 								    const int mmq_y  =  MMQ_Y_Q6_K_RDNA1;
 								    const int nwarps = NWARPS_Q6_K_RDNA1;
 								#endif // defined(RDNA3) || defined(RDNA2)
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
 								        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#elif __CUDA_ARCH__ >= CC_VOLTA
 								    const int mmq_x  =  MMQ_X_Q6_K_AMPERE;
 								    const int mmq_y  =  MMQ_Y_Q6_K_AMPERE;
 								    const int nwarps = NWARPS_Q6_K_AMPERE;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
 								        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#elif __CUDA_ARCH__ >= MIN_CC_DP4A
 								    const int mmq_x  =  MMQ_X_Q6_K_PASCAL;
 								    const int mmq_y  =  MMQ_Y_Q6_K_PASCAL;
 								    const int nwarps = NWARPS_Q6_K_PASCAL;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
 								        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								#else
 								    GGML_UNUSED(vec_dot_q6_K_q8_1_mul_mat);
 								    NO_DEVICE_CODE;
 								#endif // __CUDA_ARCH__ >= CC_VOLTA
 								}
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <int ncols_y, int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
 								#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 								// tell the compiler to use as many registers as it wants, see nwarps definition below
 								__launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
 								#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 								static __global__ void mul_mat_vec_q(
 								    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 								    const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
 								    constexpr int nwarps              = 1;
 								    constexpr int rows_per_cuda_block = 1;
 								#else
 								    constexpr int nwarps              = ncols_y <= 4 ? 4 : 2;
 								    constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
 								#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const     int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
 								    const     int row0 = rows_per_cuda_block*blockIdx.x;
 								    const     int blocks_per_row_x = ncols_x / qk;
 								    const     int blocks_per_col_y = nrows_y / QK8_1;
 								    constexpr int blocks_per_iter = vdr * nwarps*WARP_SIZE / qi;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								// partial sum for each thread
 								    float tmp[ncols_y][rows_per_cuda_block] = {0.0f};
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const block_q_t  * x = (const block_q_t  *) vx;
 								    const block_q8_1 * y = (const block_q8_1 *) vy;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) {
 								        const int kby = kbx * (qk/QK8_1); // y block index that aligns with kbx
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        // x block quant index when casting the quants to int
 								        const int kqs = vdr * (tid % (qi/vdr));
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#pragma unroll
 								        for (int j = 0; j < ncols_y; ++j) {
 								#pragma unroll
 								            for (int i = 0; i < rows_per_cuda_block; ++i) {
 								                tmp[j][i] += vec_dot_q_cuda(
 								                    &x[kbx + (row0 + i)*blocks_per_row_x], &y[j*blocks_per_col_y + kby], kqs);
 								            }
 								        }
 								    }
-												finetune : add -ngl parameter (#3762)

* Add '-ngl' support to finetune.cpp

* Add fprintf in ggml_cuda_op_add

When I tried CUDA offloading during finetuning following the readme, I got an assert here.
This probably isn't an important case because inference later gives a warning saying you should use f16 or f32 instead when using lora

* Add 'finetune.sh', which currently fails when using GPU

"error: operator (): Finetuning on tensors with type 'f16' is not yet supported"

* tweak finetune.sh

* Suppress some warnings in ggml.c

* Add f16 implementation to ggml_compute_forward_add_f16_f32

* Add an f16 case to ggml_add_cast_impl and llama_build_lora_finetune_graphs

* finetune.sh: Edit comments

* Add "add_f16_f32_f32_cuda"

* Tweak an error message

* finetune.sh: Add an optional LLAMA_MODEL_DIR variable

* finetune.sh: Add an optional LLAMA_TRAINING_DIR variable

* train : minor

* tabs to spaces

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
											
										
										
											2023-11-01 11:49:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][WARP_SIZE];
 								    if (threadIdx.y > 0) {
 								#pragma unroll
 								        for (int j = 0; j < ncols_y; ++j) {
 								#pragma unroll
 								            for (int i = 0; i < rows_per_cuda_block; ++i) {
 								                tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i];
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								            }
 								        }
 								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    __syncthreads();
 								    if (threadIdx.y > 0) {
 								        return;
 								    }
-												cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)

* Broadcasting for ggml_mul

* CUDA kernel for ggml_mul, norms in VRAM

* GPU weights not in RAM, direct loading with cuFile

* fixup! GPU weights not in RAM, direct loading with cuFile

* fixup! GPU weights not in RAM, direct loading with cuFile

* define default model path once, sync path with readme (#1366)

* ~7% faster Q5_1 AVX2 code (#1477)

* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)

* Support models in a single pytorch_model.bin

* Remove spurious line with typo

* benchmark-matmul: Print the average of the test results (#1490)

* Remove unused n_parts parameter (#1509)

* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)

* Fix for w64devkit and mingw

* make kv_f16 the default for api users (#1517)

* minor : fix compile warnings

* readme : adds WizardLM to the list of supported models (#1485)

* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)

* Make reverse prompt option act as a stop token in non-interactive scenarios

* Making requested review changes

* Update gpt_params_parse and fix a merge error

* Revert "Update gpt_params_parse and fix a merge error"

This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.

* Update gpt_params_parse and fix a merge error take 2

* examples : add persistent chat (#1495)

* examples : add persistent chat

* examples : fix whitespace

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* tests : add missing header

* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)

* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0

* llama : bump LLAMA_FILE_VERSION to 3

* cuda : update Q4 and Q8 dequantize kernels

* ggml : fix AVX dot products

* readme : update performance table + hot topics

* ggml : fix scalar implementation of Q4_1 dot

* llama : fix compile warnings in llama_set_state_data()

* llama : fix name shadowing and C4146 (#1526)

* Fix name shadowing and C4146

* Fix if macros not using defined when required

* Update llama-util.h

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Update llama-util.h

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Code style

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Fix for mingw (#1462)

* llama : add llama_init_backend() API (close #1527)

* feature : add blis and other BLAS implementation support (#1502)

* feature: add blis support

* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927

* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake

* Fix typo in INTEGER

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Revert "feature : add blis and other BLAS implementation support (#1502)"

This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.

* GPU weights not in RAM, direct loading with cuFile

* llama : code style fixes + progress print fix

* ggml : ggml_mul better broadcast support

* cmake : workarounds for cufile when CMake version < 3.25

* gg rebase fixup

* Loop in llama.cpp, fixed progress callback

* Attempt clang-tidy fix

* llama : fix vram size computation

* Add forgotten fclose()

---------

Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
											
										
										
											2023-05-20 12:19:28 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // sum up partial sums and write back result
 								#pragma unroll
 								    for (int j = 0; j < ncols_y; ++j) {
 								#pragma unroll
 								        for (int i = 0; i < rows_per_cuda_block; ++i) {
 								#pragma unroll
 								            for (int l = 0; l < nwarps-1; ++l) {
 								                tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
 								            }
 								            tmp[j][i] = warp_reduce_sum(tmp[j][i]);
 								        }
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        if (threadIdx.x < rows_per_cuda_block) {
 								            dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
 								        }
 								    }
-												cuda : add gelu support

											
										
										
											2023-07-12 17:26:18 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
 								static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
 								    // qk = quantized weights per x block
 								    // qr = number of quantized weights per data value in x block
 								    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (row >= nrows) {
 								        return;
 								    }
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int tid = threadIdx.x;
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int iter_stride = 2*GGML_CUDA_DMMV_X;
 								    const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
 								    const int y_offset = qr == 1 ? 1 : qk/2;
-												Add ReLU and SQR CUDA ops to (partially) fix Persimmon offloading (#4041)

* Add ReLU and SQR CUDA ops to fix Persimmon offloading

* Persimmon loader: More helpful error on CUDA/ROCM when offloading too many layers
											
										
										
											2023-11-13 08:58:15 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								// partial sum for each thread
 								#ifdef GGML_CUDA_F16
 								    half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
 								#else
 								    float tmp = 0.0f;
 								#endif // GGML_CUDA_F16
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 13:10:15 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int i = 0; i < ncols; i += iter_stride) {
 								        const int col = i + vals_per_iter*tid;
 								        const int ib = (row*ncols + col)/qk; // x block index
 								        const int iqs = (col%qk)/qr; // x quant index
 								        const int iybs = col - col%qk; // y block start index
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 13:10:15 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								// processing >2 values per i iter is faster for fast GPUs
 								#pragma unroll
 								        for (int j = 0; j < vals_per_iter; j += 2) {
 								            // process 2 vals per j iter
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            // dequantize
 								            // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
 								            dfloat2 v;
 								            dequantize_kernel(vx, ib, iqs + j/qr, v);
-												Add ReLU and SQR CUDA ops to (partially) fix Persimmon offloading (#4041)

* Add ReLU and SQR CUDA ops to fix Persimmon offloading

* Persimmon loader: More helpful error on CUDA/ROCM when offloading too many layers
											
										
										
											2023-11-13 08:58:15 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            // matrix multiplication
 								            // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
 								#ifdef GGML_CUDA_F16
 								            tmp += __hmul2(v, {
 								                y[iybs + iqs + j/qr + 0],
 								                y[iybs + iqs + j/qr + y_offset]
 								            });
 								#else
 								            tmp += v.x * y[iybs + iqs + j/qr + 0];
 								            tmp += v.y * y[iybs + iqs + j/qr + y_offset];
 								#endif // GGML_CUDA_F16
 								        }
-x faster (rms) norm cuda kernels (3.7% e2e improvement) (#2985)

* 2x faster (rms) norm cuda kernels

* Fix code style
											
										
										
											2023-09-04 06:53:30 +00:00
+								    }
-												ggml : sync (abort callback, mul / add broadcast, fix alibi) (#2183)


											
										
										
											2023-07-11 19:53:34 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // sum up partial sums and write back result
 								    tmp = warp_reduce_sum(tmp);
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (tid == 0) {
 								#ifdef GGML_CUDA_F16
 								        dst[row] = tmp.x + tmp.y;
 								#else
 								        dst[row] = tmp;
 								#endif // GGML_CUDA_F16
 								    }
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __global__ void mul_mat_p021_f16_f32(
 								    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
 								    const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y) {
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const half * x = (const half *) vx;
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 12:23:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
 								    const int channel = blockDim.z*blockIdx.z + threadIdx.z;
 								    const int channel_x = channel / (nchannels_y / nchannels_x);
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 12:23:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int nrows_y = ncols_x;
 								    const int nrows_dst = nrows_x;
 								    const int row_dst = row_x;
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    float tmp = 0.0f;
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
 								        const int col_x = col_x0 + threadIdx.x;
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        if (col_x >= ncols_x) {
 								            break;
 								        }
-												ggml : add Q8_0 quantization format (rename the old one to Q8_1) (ARM NEON) (#1179)

* ggml : add Q8_0 quantization format (rename the old one to Q8_1)

* tests : fix test-quantize-fns

* ggml : finalize Q8_0 implementation

* ggml : use q4_0_q8_0 and q4_2_q8_0

* ggml : fix Q8_0 dot product bug (ARM)

* ggml : Q8_0 unroll x2

* ggml : fix bug - using wrong block type

* ggml : extend quantize_fns_t with "vec_dot_type"

* ggml : fix Q8_0 to use 255 values out of 256

* ggml : fix assert using wrong QK4_2 instead of QK4_3
											
										
										
											2023-04-25 20:40:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        // x is transposed and permuted
 								        const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x;
 								        const float xi = __half2float(x[ix]);
-												CUDA: faster q8_0 -> f16 dequantization (#4895)


											
										
										
											2024-01-12 19:38:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int row_y = col_x;
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        // y is not transposed but permuted
 								        const int iy = channel*nrows_y + row_y;
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        tmp += xi * y[iy];
 								    }
-												CUDA: faster dequantize kernels for Q4_0 and Q4_1 (#4938)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-15 05:48:06 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // dst is not transposed and not permuted
 								    const int idst = channel*nrows_dst + row_dst;
-												CUDA: faster dequantize kernels for Q4_0 and Q4_1 (#4938)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-15 05:48:06 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // sum up partial sums and write back result
 								    tmp = warp_reduce_sum(tmp);
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (threadIdx.x == 0) {
 								        dst[idst] = tmp;
 								    }
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
 								    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
 								    const int row_stride_x, const int channel_stride_x, const int channel_x_divisor) {
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const half * x = (const half *) vx;
-												SOTA 2-bit quants (#4773)

* iq2_xxs: basics

* iq2_xxs: scalar and AVX2 dot products

Needed to change Q8_K to have quants in the -127...127 range,
else the IQ2_XXS AVX implementation becomes very awkward.
The alternative would have been to use Q8_0 instead. Perhaps
I'll change later, for now this is what we have.

* iq2_xxs: ARM_NEON dot product

Somehow strangely slow (112 ms/token).

* iq2_xxs: WIP Metal

Dequantize works, something is still wrong with the
dot product.

* iq2_xxs: Metal dot product now works

We have
PP-512 = 475 t/s
TG-128 = 47.3 t/s

Not the greatest performance, but not complete garbage either.

* iq2_xxs: slighty faster dot product

TG-128 is now 48.4 t/s

* iq2_xxs: slighty faster dot product

TG-128 is now 50.9 t/s

* iq2_xxs: even faster Metal dot product

TG-128 is now 54.1 t/s.

Strangely enough, putting the signs lookup table
into shared memory has a bigger impact than the
grid values being in shared memory.

* iq2_xxs: dequantize CUDA kernel - fix conflict with master

* iq2_xxs: quantized CUDA dot product (MMVQ)

We get TG-128 = 153.1 t/s

* iq2_xxs: slightly faster CUDA dot product

TG-128 is now at 155.1 t/s.

* iq2_xxs: add to llama ftype enum

* iq2_xxs: fix MoE on Metal

* Fix missing MMQ ops when on hipBLAS

I had put the ggml_supports_mmq call at the wrong place.

* Fix bug in qequantize_row_iq2_xxs

The 0.25f factor was missing.
Great detective work by @ggerganov!

* Fixing tests

* PR suggestion

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-08 15:02:32 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int row_x     = blockDim.y*blockIdx.y + threadIdx.y;
 								    const int channel   = blockDim.z*blockIdx.z + threadIdx.z;
 								    const int channel_x = channel / channel_x_divisor;
-												ggml : SOTA 2-bit quants (add IQ2_XS) (#4856)

* iq2_xs: basics

* iq2_xs: this should have been in the basics

* iq2_xs: CUDA and scalar CPU works

* iq2_xs: WIP Metal

* iq2_xs: Metal now works

* iq2_xs: working, but dog slow, ARM_NEON dot product

* iq2_xs: better ARM_NEON dot product

We are now at 19.5 t/s for TG-128 and 61 t/s for PP-512 when
running on the CPU.

* iq2_xs: AVX2 dot product - 19.5 t/s

* iq2_xs: faster AVX2 dit product

21.4 t/s for TG-128, 59.2 t/s for PP-512.
The latter is 2x compared to the previous version.

* iq2_xs: had forgotten to delete iq2-data.h

* Add llama enum for IQ2_XS

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-11 19:39:39 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int nrows_y   = ncols_x;
 								    const int nrows_dst = nrows_x;
 								    const int row_dst   = row_x;
-												Adding IQ2_S and IQ2_M to complete coverage of the 2-3 bit quantization range (#5721)

* Adding IQ2_S and IQ2_M as a single cumulative commit

* Update examples/quantize/quantize.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-02-26 16:28:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int idst = channel*nrows_dst + row_dst;
-												SOTA 3-bit quants  (#5196)

* iq3_xxs: quantize/dequantize

RMSE seems a bit high-ish at about half-way between q2_K and
q3_K, so need to check more.

* iq3_xxs: CUDA dequantize works

* iq2_xxs: tuning quantization

* iq3_xxs: starting to look better

PPL on wiki.test.raw
LLaMA-v1-7B: 6.4218
LLaMA-v2-7B: 6.3560
Mistral-7B : 6.0717

This is better than Q3_K_XS, with a 5% reduction in quantized model
size.

* iq3_xxs: CUDA dot product

We have
PP-512: 5891 t/s
TG-128: 143.9 t/s

* iq3_xxs: scalar and AVX2 dot products

* iq3_xxs: ARM_NEON and Metal

Metal performance is decent, ARM_NEON is pathetic

* iq3_xxs: slightly better grid points

* Faster iq3_xxs and iq2_xs dot products on CUDA

* iq3_xxs: add some quant mix

* iq3_xxs: fix failing quantization test

Dot product still fails. Is this real?

* iq3_xxs: hopefully fix ROCm

* iq3_xxs: failing tests

This time the dot product accuracy did find an actual bug
in the AVX2 implementation.

* Add IQ3_XXS to test-backend-ops

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-30 13:14:12 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    float tmp = 0.0f;
-												IQ3_S: a much better alternative to Q3_K (#5676)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* Resurrecting iq3_xs

After all the experimentation, nothing was better than this.

* Minor PPL improvement via a block scale fudge factor

* Minor improvement via 3 neighbours

* iq3_xs: working scalar and AVX2 dot products

* iq3_xs: ARM_NEON dot product - works but extremely slow (10 t/s)

* iq3_xs: working Metal implementation

* Adding IQ3_M - IQ3_XS mix with mostly Q4_K

* iiq3_xs: a 3.4375 bpw variant

* iq3_xs: make CUDA work for new version

* iq3_xs: make scalar and AVX2 work for new version

* iq3_s: make ARM_NEON work with new version

* iq3_xs: make new version work on metal

Performance is very similar to Q3_K_S

* iq3_xs: tiny Metal speed improvement

* iq3_xs: tiny Metal speed improvement

* Fix stupid warning

* Q3_K_XS now uses a mix of IQ3_XS and IQ3_XXS

* iq3_xs: rename to iq3_s

* iq3_s: make tests pass

* Move Q3_K_XS mix to 3.25 bpw

* Attempt to fix failing tests

* Another attempt to fix the Windows builds

* Attempt to fix ROCm

* ROCm again

* iq3_s: partial fix for QK_K = 64

* iq3_s: make it work on metal for QK_K = 64

Pleasent surprise: the coding was super-block size independent,
so all it took was to delete some QK_K == 256 guards.

* Will this fix ROCm?

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-24 14:23:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
 								        const int col_x = col_x0 + threadIdx.x;
-.5 bit quantization (#5453)

* iq1_s: WIP basics

* iq1_s: CUDA is working

* iq1_s: scalar CPU dot product

* iq1_s: WIP AVX2 dot product - something is not right

* Fix tests

* Fix shadow warnings

* Fix after merge with latest master

* iq1_s: AVX2 finally works

* iq1_s: ARM_NEON dot product. Works, but not very fast

* iq1_s: better grid

* iq1_s: use IQ2_XXS for attn_output

At a cost of 0.04 extra bpw this gives a big improvement in PPL.

* iq1_s: Metal basics

Dequantize works, but not dot product

* iq1_s: Metal works, but quite slow

As usual, Apple Silicon does not like the code I write.

* iq1_s: Tests

* iq1_s: slightly faster dot product

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-18 16:16:55 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        if (col_x >= ncols_x) {
 								            break;
 								        }
-												IQ4_NL: 4-bit non-linear quants with blocks of 32 (#5590)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* iq4_nl: Fix after merging with master

* iq4_nl: another fix after merging with master

* Use IQ4_NL instead of Q4_K when using k-quants is not possible

* Fix typo that makes several tests fail

* It was the ggml_vdotq thing missed inside the brackets

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-21 09:39:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int row_y = col_x;
-												IQ4_XS: a 4.25 bpw quantization (#5747)

* Try IQ4_NL with blocks of 64 - does not look good

* iq4_xs: go to super-blocks of 256 and 6-bit scales for blocks of 32

* iq4_xs: CUDA works - 133.2 t/s

* iq4_xs: AVX2 dot product

* iq4_xs: ARM_NEON dot product

* iq4_nl: Metal implementation

As usual, Metal / Apple Silicon don't like my quants.

* iq3_xs: minor fix

* iq4_xs: shrink by using IQ3_S for attn_k and attn_q

* iq4_xs: revert using IQ3_S for attn_k and attn_v

PPL vs size is good, but CPU performance suffers: on M2 Max
TG-128 drops to 21.7 t/s from 28.8, and on a Ryzen-7950X
to 14.5 t/s from 15.8 t/s. On CUDA we have 135 t/s when
using IQ3_S vs 133 t/s with pure IQ4_XS.

* Fix CI

* iq4_xs: Added forgotten check for 256 divisibility

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-27 14:34:24 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
 								        const int iy = channel*nrows_y + row_y;
-												CUDA: fixed redundant value dequantization (#4809)


											
										
										
											2024-01-07 16:24:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const float xi = __half2float(x[ix]);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        tmp += xi * y[iy];
 								    }
 								    // sum up partial sums and write back result
 								    tmp = warp_reduce_sum(tmp);
 								    if (threadIdx.x == 0) {
 								        dst[idst] = tmp;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								    }
 								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
 								    const float * xi = (const float *) cxi;
 								    float * dsti = (float *) cdsti;
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 13:38:36 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    *dsti = *xi;
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 13:38:36 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
 								    const float * xi = (const float *) cxi;
 								    half * dsti = (half *) cdsti;
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 13:38:36 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    *dsti = __float2half(*xi);
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 13:38:36 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) {
 								    const half * xi = (const half *) cxi;
 								    half * dsti = (half *) cdsti;
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 13:38:36 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    *dsti = *xi;
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
 								    const half * xi = (const half *) cxi;
 								    float * dsti = (float *) cdsti;
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    *dsti = *xi;
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <cpy_kernel_t cpy_1>
 								static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
 								                                   const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
 								                                   const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
 								                                   const int nb12, const int nb13) {
 								    const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (i >= ne) {
 								        return;
 								    }
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 19:56:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
 								    // then combine those indices with the corresponding byte offsets to get the total offsets
 								    const int64_t i03 = i/(ne00 * ne01 * ne02);
 								    const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
 								    const int64_t i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
 								    const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
 								    const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
 								    const int64_t i13 = i/(ne10 * ne11 * ne12);
 								    const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
 								    const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
 								    const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
 								    const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
 								    cpy_1(cx + x_offset, cdst + dst_offset);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
 								    const float * xi = (const float *) cxi;
 								    block_q8_0 * dsti = (block_q8_0 *) cdsti;
-												SOTA 2-bit quants (#4773)

* iq2_xxs: basics

* iq2_xxs: scalar and AVX2 dot products

Needed to change Q8_K to have quants in the -127...127 range,
else the IQ2_XXS AVX implementation becomes very awkward.
The alternative would have been to use Q8_0 instead. Perhaps
I'll change later, for now this is what we have.

* iq2_xxs: ARM_NEON dot product

Somehow strangely slow (112 ms/token).

* iq2_xxs: WIP Metal

Dequantize works, something is still wrong with the
dot product.

* iq2_xxs: Metal dot product now works

We have
PP-512 = 475 t/s
TG-128 = 47.3 t/s

Not the greatest performance, but not complete garbage either.

* iq2_xxs: slighty faster dot product

TG-128 is now 48.4 t/s

* iq2_xxs: slighty faster dot product

TG-128 is now 50.9 t/s

* iq2_xxs: even faster Metal dot product

TG-128 is now 54.1 t/s.

Strangely enough, putting the signs lookup table
into shared memory has a bigger impact than the
grid values being in shared memory.

* iq2_xxs: dequantize CUDA kernel - fix conflict with master

* iq2_xxs: quantized CUDA dot product (MMVQ)

We get TG-128 = 153.1 t/s

* iq2_xxs: slightly faster CUDA dot product

TG-128 is now at 155.1 t/s.

* iq2_xxs: add to llama ftype enum

* iq2_xxs: fix MoE on Metal

* Fix missing MMQ ops when on hipBLAS

I had put the ggml_supports_mmq call at the wrong place.

* Fix bug in qequantize_row_iq2_xxs

The 0.25f factor was missing.
Great detective work by @ggerganov!

* Fixing tests

* PR suggestion

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-08 15:02:32 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    float amax = 0.0f; // absolute max
-												ggml : SOTA 2-bit quants (add IQ2_XS) (#4856)

* iq2_xs: basics

* iq2_xs: this should have been in the basics

* iq2_xs: CUDA and scalar CPU works

* iq2_xs: WIP Metal

* iq2_xs: Metal now works

* iq2_xs: working, but dog slow, ARM_NEON dot product

* iq2_xs: better ARM_NEON dot product

We are now at 19.5 t/s for TG-128 and 61 t/s for PP-512 when
running on the CPU.

* iq2_xs: AVX2 dot product - 19.5 t/s

* iq2_xs: faster AVX2 dit product

21.4 t/s for TG-128, 59.2 t/s for PP-512.
The latter is 2x compared to the previous version.

* iq2_xs: had forgotten to delete iq2-data.h

* Add llama enum for IQ2_XS

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-11 19:39:39 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int j = 0; j < QK8_0; j++) {
 								        const float v = xi[j];
 								        amax = fmaxf(amax, fabsf(v));
 								    }
-												CUDA: more warps for mmvq on NVIDIA (#5394)


											
										
										
											2024-02-08 20:56:40 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const float d = amax / ((1 << 7) - 1);
 								    const float id = d ? 1.0f/d : 0.0f;
-												CUDA: more warps for mmvq on NVIDIA (#5394)


											
										
										
											2024-02-08 20:56:40 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    dsti->d = d;
 								    for (int j = 0; j < QK8_0; ++j) {
 								        const float x0 = xi[j]*id;
 								        dsti->qs[j] = roundf(x0);
 								    }
 								}
 								static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
 								    const float * xi = (const float *) cxi;
 								    block_q4_0 * dsti = (block_q4_0 *) cdsti;
 								    float amax = 0.0f;
 								    float vmax = 0.0f;
 								    for (int j = 0; j < QK4_0; ++j) {
 								        const float v = xi[j];
 								        if (amax < fabsf(v)) {
 								            amax = fabsf(v);
 								            vmax = v;
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 18:08:39 +00:00
+								        }
 								    }
-												CUDA: more warps for mmvq on NVIDIA (#5394)


											
										
										
											2024-02-08 20:56:40 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const float d  = vmax / -8;
 								    const float id = d ? 1.0f/d : 0.0f;
 								    dsti->d = d;
 								    for (int j = 0; j < QK4_0/2; ++j) {
 								        const float x0 = xi[0       + j]*id;
 								        const float x1 = xi[QK4_0/2 + j]*id;
 								        const uint8_t xi0 = min(15, (int8_t)(x0 + 8.5f));
 								        const uint8_t xi1 = min(15, (int8_t)(x1 + 8.5f));
 								        dsti->qs[j]  = xi0;
 								        dsti->qs[j] |= xi1 << 4;
 								    }
 								}
 								static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
 								    const float * xi = (const float *) cxi;
 								    block_q4_1 * dsti = (block_q4_1 *) cdsti;
 								    float vmin = FLT_MAX;
 								    float vmax = -FLT_MAX;
 								    for (int j = 0; j < QK4_1; ++j) {
 								        const float v = xi[j];
 								        if (v < vmin) vmin = v;
 								        if (v > vmax) vmax = v;
-												CUDA: mul_mat_vec_q for batch sizes > 1 (#5351)


											
										
										
											2024-02-06 13:44:06 +00:00
+								    }
-												SOTA 3-bit quants  (#5196)

* iq3_xxs: quantize/dequantize

RMSE seems a bit high-ish at about half-way between q2_K and
q3_K, so need to check more.

* iq3_xxs: CUDA dequantize works

* iq2_xxs: tuning quantization

* iq3_xxs: starting to look better

PPL on wiki.test.raw
LLaMA-v1-7B: 6.4218
LLaMA-v2-7B: 6.3560
Mistral-7B : 6.0717

This is better than Q3_K_XS, with a 5% reduction in quantized model
size.

* iq3_xxs: CUDA dot product

We have
PP-512: 5891 t/s
TG-128: 143.9 t/s

* iq3_xxs: scalar and AVX2 dot products

* iq3_xxs: ARM_NEON and Metal

Metal performance is decent, ARM_NEON is pathetic

* iq3_xxs: slightly better grid points

* Faster iq3_xxs and iq2_xs dot products on CUDA

* iq3_xxs: add some quant mix

* iq3_xxs: fix failing quantization test

Dot product still fails. Is this real?

* iq3_xxs: hopefully fix ROCm

* iq3_xxs: failing tests

This time the dot product accuracy did find an actual bug
in the AVX2 implementation.

* Add IQ3_XXS to test-backend-ops

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-30 13:14:12 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const float d  = (vmax - vmin) / ((1 << 4) - 1);
 								    const float id = d ? 1.0f/d : 0.0f;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    dsti->dm.x = d;
 								    dsti->dm.y = vmin;
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 07:42:34 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int j = 0; j < QK4_1/2; ++j) {
 								        const float x0 = (xi[0       + j] - vmin)*id;
 								        const float x1 = (xi[QK4_1/2 + j] - vmin)*id;
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const uint8_t xi0 = min(15, (int8_t)(x0 + 0.5f));
 								        const uint8_t xi1 = min(15, (int8_t)(x1 + 0.5f));
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        dsti->qs[j]  = xi0;
 								        dsti->qs[j] |= xi1 << 4;
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 14:48:10 +00:00
+								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								}
-												Add ability to use Q5_0, Q5_1, and IQ4_NL for quantized K cache (#6183)

* k_cache: be able to use Q5_0

* k_cache: be able to use Q5_1 on CODA

* k_cache: be able to use Q5_0 on Metal

* k_cache: be able to use Q5_1 on Metal

* k_cache: be able to use IQ4_NL - just CUDA for now

* k_cache: be able to use IQ4_NL on Metal

* k_cache: add newly added supported types to llama-bench and CUDA supports_op

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-03-21 07:27:57 +00:00
+								static __device__ void cpy_blck_f32_q5_0(const char * cxi, char * cdsti) {
 								    const float * xi = (const float *) cxi;
 								    block_q5_0 * dsti = (block_q5_0 *) cdsti;
 								    float amax = 0.0f;
 								    float vmax = 0.0f;
 								    for (int j = 0; j < QK5_0; ++j) {
 								        const float v = xi[j];
 								        if (amax < fabsf(v)) {
 								            amax = fabsf(v);
 								            vmax = v;
 								        }
 								    }
 								    const float d  = vmax / -16;
 								    const float id = d ? 1.0f/d : 0.0f;
 								    dsti->d = d;
 								    uint32_t qh = 0;
 								    for (int j = 0; j < QK5_0/2; ++j) {
 								        const float x0 = xi[0       + j]*id;
 								        const float x1 = xi[QK5_0/2 + j]*id;
 								        const uint8_t xi0 = min(31, (int8_t)(x0 + 16.5f));
 								        const uint8_t xi1 = min(31, (int8_t)(x1 + 16.5f));
 								        dsti->qs[j]  = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
 								        qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
 								        qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2);
 								    }
 								    memcpy(dsti->qh, &qh, sizeof(qh));
 								}
 								static __device__ void cpy_blck_f32_q5_1(const char * cxi, char * cdsti) {
 								    const float * xi = (const float *) cxi;
 								    block_q5_1 * dsti = (block_q5_1 *) cdsti;
 								    float min = xi[0];
 								    float max = xi[0];
 								    for (int j = 1; j < QK5_1; ++j) {
 								        const float v = xi[j];
 								        min = v < min ? v : min;
 								        max = v > max ? v : max;
 								    }
 								    const float d  = (max - min) / 31;
 								    const float id = d ? 1.0f/d : 0.0f;
 								    dsti->dm.x = d;
 								    dsti->dm.y = min;
 								    uint32_t qh = 0;
 								    for (int j = 0; j < QK5_1/2; ++j) {
 								        const float x0 = (xi[0       + j] - min)*id;
 								        const float x1 = (xi[QK5_1/2 + j] - min)*id;
 								        const uint8_t xi0 = (uint8_t)(x0 + 0.5f);
 								        const uint8_t xi1 = (uint8_t)(x1 + 0.5f);
 								        dsti->qs[j]  = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
 								        qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
 								        qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1/2);
 								    }
 								    memcpy(dsti->qh, &qh, sizeof(qh));
 								}
 								static __device__ __forceinline__ int best_index_int8(int n, const int8_t * val, float x) {
 								    if (x <= val[0]) return 0;
 								    if (x >= val[n-1]) return n-1;
 								    int ml = 0, mu = n-1;
 								    while (mu-ml > 1) {
 								        int mav = (ml+mu)/2;
 								        if (x < val[mav]) mu = mav; else ml = mav;
 								    }
 								    return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
 								}
 								static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
 								    const float * xi = (const float *) cxi;
 								    block_iq4_nl * dsti = (block_iq4_nl *) cdsti;
 								    float amax = 0.0f;
 								    float vmax = 0.0f;
 								    for (int j = 0; j < QK4_NL; ++j) {
 								        const float v = xi[j];
 								        if (amax < fabsf(v)) {
 								            amax = fabsf(v);
 								            vmax = v;
 								        }
 								    }
 								    float d = vmax / kvalues_iq4nl[0];
 								    const float id = d ? 1.0f/d : 0.0f;
 								    float sumqx = 0, sumq2 = 0;
 								    for (int j = 0; j < QK4_NL/2; ++j) {
 								        const float x0 = xi[0        + j]*id;
 								        const float x1 = xi[QK4_NL/2 + j]*id;
 								        const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl, x0);
 								        const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl, x1);
 								        dsti->qs[j] = xi0 | (xi1 << 4);
 								        const float v0 = kvalues_iq4nl[xi0];
 								        const float v1 = kvalues_iq4nl[xi1];
 								        const float w0 = xi[0        + j]*xi[0        + j];
 								        const float w1 = xi[QK4_NL/2 + j]*xi[QK4_NL/2 + j];
 								        sumqx += w0*v0*xi[j] + w1*v1*xi[QK4_NL/2 + j];
 								        sumq2 += w0*v0*v0 + w1*v1*v1;
 								    }
 								    dsti->d = sumq2 > 0 ? sumqx/sumq2 : d;
 								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <cpy_kernel_t cpy_blck, int qk>
 								static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
 								                                 const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
 								                                 const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
 								                                 const int nb12, const int nb13) {
 								    const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 07:42:34 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (i >= ne) {
 								        return;
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int i03 = i/(ne00 * ne01 * ne02);
 								    const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
 								    const int i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
 								    const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
 								    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 07:42:34 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int i13 = i/(ne10 * ne11 * ne12);
 								    const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
 								    const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
 								    const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
 								    const int dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
 								    cpy_blck(cx + x_offset, cdst + dst_offset);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
 								    const float y = (i0 / 2 - low) / max(0.001f, high - low);
 								    return 1.0f - min(1.0f, max(0.0f, y));
 								}
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								struct rope_corr_dims {
 								    float v[4];
 								};
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 07:42:34 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
 								// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
 								static __device__ void rope_yarn(
 								    float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
 								    float * cos_theta, float * sin_theta
 								) {
 								    // Get n-d rotational scaling corrected for extrapolation
 								    float theta_interp = freq_scale * theta_extrap;
 								    float theta = theta_interp;
 								    if (ext_factor != 0.0f) {
 								        float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
 								        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
 								        // Get n-d magnitude scaling corrected for interpolation
 								        mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    *cos_theta = cosf(theta) * mscale;
 								    *sin_theta = sinf(theta) * mscale;
 								}
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								// rope == RoPE == rotary positional embedding
 								template<typename T, bool has_pos>
 								static __global__ void rope(
 								    const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
 								    float ext_factor, float attn_factor, rope_corr_dims corr_dims
 								) {
 								    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (col >= ncols) {
 								        return;
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 14:48:10 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
 								    const int row = blockDim.x*blockIdx.x + threadIdx.x;
 								    const int i = row*ncols + col;
 								    const int i2 = row/p_delta_rows;
 								    const int p = has_pos ? pos[i2] : 0;
 								    const float theta_base = p*powf(freq_base, -float(col)/ncols);
 								    float cos_theta, sin_theta;
 								    rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta);
 								    const float x0 = x[i + 0];
 								    const float x1 = x[i + 1];
 								    dst[i + 0] = x0*cos_theta - x1*sin_theta;
 								    dst[i + 1] = x0*sin_theta + x1*cos_theta;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<typename T, bool has_pos>
 								static __global__ void rope_neox(
 								    const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
 								    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
 								) {
 								    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (col >= ncols) {
 								        return;
 								    }
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 07:42:34 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int row = blockDim.x*blockIdx.x + threadIdx.x;
 								    const int ib = col / n_dims;
 								    const int ic = col % n_dims;
 								    if (ib > 0) {
 								        const int i = row*ncols + ib*n_dims + ic;
 								        dst[i + 0] = x[i + 0];
 								        dst[i + 1] = x[i + 1];
 								        return;
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int i  = row*ncols + ib*n_dims + ic/2;
 								    const int i2 = row/p_delta_rows;
 								    float cur_rot = inv_ndims * ic - ib;
 								    const int p = has_pos ? pos[i2] : 0;
 								    const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f);
 								    float cos_theta, sin_theta;
 								    rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
 								    const float x0 = x[i + 0];
 								    const float x1 = x[i + n_dims/2];
 								    dst[i + 0]        = x0*cos_theta - x1*sin_theta;
 								    dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
 								}
 								static __global__ void rope_glm_f32(
 								    const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
 								    int n_ctx
 								) {
 								    const int col = blockDim.x*blockIdx.x + threadIdx.x;
 								    const int half_n_dims = ncols/4;
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (col >= half_n_dims) {
 								        return;
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 14:48:10 +00:00
+								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int row = blockDim.y*blockIdx.y + threadIdx.y;
 								    const int i = row*ncols + col;
 								    const int i2 = row/p_delta_rows;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const float col_theta_scale = powf(freq_base, -2.0f*col/ncols);
 								     // FIXME: this is likely wrong
 								    const int p = pos != nullptr ? pos[i2] : 0;
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 07:42:34 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const float theta = min(p, n_ctx - 2)*freq_scale*col_theta_scale;
 								    const float sin_theta = sinf(theta);
 								    const float cos_theta = cosf(theta);
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const float x0 = x[i + 0];
 								    const float x1 = x[i + half_n_dims];
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    dst[i + 0]           = x0*cos_theta - x1*sin_theta;
 								    dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const float block_theta = ((float)max(p - n_ctx - 2, 0))*col_theta_scale;
 								    const float sin_block_theta = sinf(block_theta);
 								    const float cos_block_theta = cosf(block_theta);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const float x2 = x[i + half_n_dims * 2];
 								    const float x3 = x[i + half_n_dims * 3];
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 07:42:34 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
 								    dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
 								}
 								static __global__ void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
 								                                 const int n_heads_log2_floor, const float m0, const float m1) {
 								    const int col = blockDim.x*blockIdx.x + threadIdx.x;
 								    if (col >= ncols) {
 								        return;
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int row = blockDim.y*blockIdx.y + threadIdx.y;
 								    const int i = row*ncols + col;
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int k = row/k_rows;
 								    float m_k;
 								    if (k < n_heads_log2_floor) {
 								        m_k = powf(m0, k + 1);
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
+								    } else {
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 14:48:10 +00:00
+								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    dst[i] = col * m_k + x[i];
 								}
-												k_quants tuning for Falcon-7b (#2816)

* Make ggml-cuda.cu build with QK_K = 64

Using LLAMA_CUDA_FORCE_DMMV = ON and -nommq it runs and produces
a meaningful result.

* k_quants tuning for Falcon-7b

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-08-27 12:19:59 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
 								    const int row = blockIdx.x;
 								    const int col = threadIdx.x;
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 07:42:34 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    float sum = 0.0f;
 								    for (int i = col; i < ncols; i += blockDim.x) {
 								        sum += x[row * ncols + i];
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    sum = warp_reduce_sum(sum);
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (col == 0) {
 								        dst[row] = sum;
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 14:48:10 +00:00
+								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<typename T>
-												cuda : fix conflict with std::swap (#6186)


											
										
										
											2024-03-21 00:47:46 +00:00
+								static inline __device__ void ggml_cuda_swap(T & a, T & b) {
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    T tmp = a;
 								    a = b;
 								    b = tmp;
 								}
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<ggml_sort_order order>
 								static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols) {
 								    // bitonic sort
 								    int col = threadIdx.x;
 								    int row = blockIdx.y;
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 07:42:34 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (col >= ncols) return;
 								    const float * x_row = x + row * ncols;
 								    int * dst_row = dst + row * ncols;
 								    // initialize indices
 								    if (col < ncols) {
 								        dst_row[col] = col;
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    __syncthreads();
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int k = 2; k <= ncols; k *= 2) {
 								        for (int j = k / 2; j > 0; j /= 2) {
 								            int ixj = col ^ j;
 								            if (ixj > col) {
 								                if ((col & k) == 0) {
 								                    if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
-												cuda : fix conflict with std::swap (#6186)


											
										
										
											2024-03-21 00:47:46 +00:00
+								                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								                    }
 								                } else {
 								                    if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
-												cuda : fix conflict with std::swap (#6186)


											
										
										
											2024-03-21 00:47:46 +00:00
+								                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								                    }
 								                }
 								            }
 								            __syncthreads();
 								        }
 								    }
 								}
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
 								    const int col = blockDim.y*blockIdx.y + threadIdx.y;
 								    const int row = blockDim.x*blockIdx.x + threadIdx.x;
 								    if (col >= ncols) {
 								        return;
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 14:48:10 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
 								    const int i = row*ncols + col;
 								    //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i];
 								    //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
 								    dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <bool vals_smem, int ncols_template, int block_size_template>
 								static __global__ void soft_max_f32(const float * x, const float * mask, const float * pos, float * dst, const int ncols_par, const int nrows_y, const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) {
 								    const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int tid  = threadIdx.x;
 								    const int rowx = blockIdx.x;
 								    const int rowy = rowx % nrows_y; // broadcast the mask in the row dimension
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 07:42:34 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int block_size = block_size_template == 0 ? blockDim.x : block_size_template;
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int warp_id = threadIdx.x / WARP_SIZE;
 								    const int lane_id = threadIdx.x % WARP_SIZE;
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    float slope = 0.0f;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // ALiBi
 								    if (max_bias > 0.0f) {
 								        const int h = rowx/nrows_y; // head index
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const float base = h < n_head_log2 ? m0 : m1;
 								        const int   exp  = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 07:42:34 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        slope = powf(base, exp);
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    extern __shared__ float data_soft_max_f32[];
 								    float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication
 								    // shared memory buffer to cache values between iterations:
 								    float * vals = vals_smem ? buf_iw + WARP_SIZE : dst + rowx*ncols;
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-12 22:24:45 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    float max_val = -INFINITY;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#pragma unroll
 								    for (int col0 = 0; col0 < ncols; col0 += block_size) {
 								        const int col = col0 + tid;
-												llama : grouped-query attention + LLaMAv2 70B support (#2276)

* CUDA: GQA implementation

* llama : support for GQA and LLaMAv2 70B

ggml-ci

* py : fix hparams parsing (if-else blocks)

ggml-ci

* py : oh boy ..

ggml-ci

* help : fix gqa value for 70B

ggml-ci

---------

Co-authored-by: JohannesGaessler <johannesg@5d6.de>
											
										
										
											2023-07-23 12:09:47 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        if (ncols_template == 0 && col >= ncols) {
 								            break;
 								        }
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int ix = rowx*ncols + col;
 								        const int iy = rowy*ncols + col;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        vals[col] = val;
 								        max_val = max(max_val, val);
 								    }
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 12:37:33 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // find the max value in the block
 								    max_val = warp_reduce_max(max_val);
 								    if (block_size > WARP_SIZE) {
 								        if (warp_id == 0) {
 								            buf_iw[lane_id] = -INFINITY;
 								        }
 								        __syncthreads();
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 12:37:33 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        if (lane_id == 0) {
 								            buf_iw[warp_id] = max_val;
 								        }
 								        __syncthreads();
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 12:37:33 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        max_val = buf_iw[lane_id];
 								        max_val = warp_reduce_max(max_val);
 								    }
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    float tmp = 0.0f; // partial sum
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#pragma unroll
 								    for (int col0 = 0; col0 < ncols; col0 += block_size) {
 								        const int col = col0 + tid;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        if (ncols_template == 0 && col >= ncols) {
 								            break;
 								        }
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const float val = expf(vals[col] - max_val);
 								        tmp += val;
 								        vals[col] = val;
 								    }
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 11:03:17 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // find the sum of exps in the block
 								    tmp = warp_reduce_sum(tmp);
 								    if (block_size > WARP_SIZE) {
 								        __syncthreads();
 								        if (warp_id == 0) {
 								            buf_iw[lane_id] = 0.0f;
 								        }
 								        __syncthreads();
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 11:03:17 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        if (lane_id == 0) {
 								            buf_iw[warp_id] = tmp;
 								        }
 								        __syncthreads();
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 11:03:17 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        tmp = buf_iw[lane_id];
 								        tmp = warp_reduce_sum(tmp);
 								    }
 								    const float inv_sum = 1.0f / tmp;
 								#pragma unroll
 								    for (int col0 = 0; col0 < ncols; col0 += block_size) {
 								        const int col = col0 + tid;
 								        if (ncols_template == 0 && col >= ncols) {
 								            return;
 								        }
 								        const int idst = rowx*ncols + col;
 								        dst[idst] = vals[col] * inv_sum;
 								    }
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 11:03:17 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) {
 								    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 11:03:17 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (i >= k) {
 								        return;
 								    }
 								    dst[i] = scale * x[i];
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 11:03:17 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
 								    const int i = blockDim.x*blockIdx.x + threadIdx.x;
 								    if (i >= k) {
 								        return;
 								    }
-												ggml : sync (im2col, GPU conv, 32-bit arm compat) (#4060)

ggml-ci
											
										
										
											2023-11-13 14:55:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
-												ggml : sync (im2col, GPU conv, 32-bit arm compat) (#4060)

ggml-ci
											
										
										
											2023-11-13 14:55:52 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <typename T>
 								static  __global__ void im2col_kernel(
 								        const float * x, T * dst, int64_t batch_offset,
 								        int64_t offset_delta, int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH, int64_t pelements, int64_t CHW,
 								        int s0, int s1, int p0, int p1, int d0, int d1) {
 								    const int64_t i = threadIdx.x + blockIdx.x * blockDim.x;
 								    if (i >= pelements) {
 								        return;
 								    }
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 12:37:33 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t  ksize = OW * (KH > 1 ? KW : 1);
 								    const int64_t  kx = i / ksize;
 								    const int64_t  kd = kx * ksize;
 								    const int64_t  ky = (i - kd) / OW;
 								    const int64_t  ix = i % OW;
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 12:37:33 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t  oh = blockIdx.y;
 								    const int64_t  batch = blockIdx.z / IC;
 								    const int64_t  ic = blockIdx.z % IC;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t iiw = ix * s0 + kx * d0 - p0;
 								    const int64_t iih = oh * s1 + ky * d1 - p1;
-												llm : add MPT support (#3417)

* CUDA: added support for ggml_clamp (see also: https://github.com/ggerganov/ggml/issues/545)

* mpt : added an implementation based (mostly) on falcon integration, modified with deltas from ggml/examples/mpt

* mpt : protect against "clip_qkv": null in mpt-7b

* mpt : quick fix to avoid "Strange model" warning when quantizing MPT models

* mpt : addendum to changeset:84e30e8 - leave parameter clamp_kqv out from metadata rather than use 0.0 to indicate "no clamping" (more compliant with the current GGUF spec?)

* mpt : standardized all tensor names to follow GGUF spec

* mpt : addendum to changeset:1be89c40 - use "req" parameter of GGUF_GET_KEY macro instead of duplicate code

* mpt : fixed comment s/gptneox/mpt/

* mpt : remove tabs, trailing whitespace

* mpt : removed ne01 + n_past == ne00 assertion from alibi (cuda/f32) and rope_shift from build_mpt

* mpt : updated convert-mpt-hf-to-gguf.py to reflect changes made to convert-gptneox-hf-to-gguf.py in pr:3252

* comment out n_past instead of marking it unused

* mpt : removed hardcoded +178 from convert script in favor of utilizing hparams["vocab_size"]

* mpt : remove unused tokenizer_json in convert script

* ggml : remove obsolete n_past assert in ggml_alibi

* llama : print clam_kqv and max_alibi_bias hparams

---------

Co-authored-by: Cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-10-10 07:50:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t offset_dst =
 								        ((batch * OH + oh) * OW + ix) * CHW +
 								        (ic * (KW * KH) + ky * KW + kx);
 								    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
 								        dst[offset_dst] = 0.0f;
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 16:04:36 +00:00
+								    } else {
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const int64_t offset_src = ic * offset_delta + batch * batch_offset;
 								        dst[offset_dst] = x[offset_src + iih * IW + iiw];
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 16:04:36 +00:00
+								    }
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 13:38:36 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <typename Ti, typename To>
 								static  __global__ void pool2d_nchw_kernel(
 								        const int ih, const int iw, const int oh, const int ow,
 								        const int kh, const int kw, const int sh, const int sw,
 								        const int ph, const int pw, const int parallel_elements,
 								        const Ti* src, To* dst, const enum ggml_op_pool op) {
 								    int idx = threadIdx.x + blockIdx.x * blockDim.x;
 								    if (idx >= parallel_elements) {
 								        return;
 								    }
-												ggml-cuda : support stablelm rope (#4156)

* ggml-cuda : support stablelm rope

* remove unused freq_base kernel parameter

* add n_dims parameter to llm_build_k_shift, default to n_rot via overload

* llama : fix llm_build_k_shift args

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-11-24 17:04:31 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int I_HW = ih * iw;
 								    const int O_HW = oh * ow;
 								    const int nc = idx / O_HW;
 								    const int cur_oh = idx % O_HW / ow;
 								    const int cur_ow = idx % O_HW % ow;
 								    const Ti* i_ptr = src + nc * I_HW;
 								    To* o_ptr = dst + nc * O_HW;
 								    const int start_h = cur_oh * sh - ph;
 								    const int bh = max(0, start_h);
 								    const int eh = min(ih, start_h + kh);
 								    const int start_w = cur_ow * sw - pw;
 								    const int bw = max(0, start_w);
 								    const int ew = min(iw, start_w + kw);
 								    const To scale = 1. / (kh * kw);
 								    To res = 0;
 								    switch (op) {
 								        case GGML_OP_POOL_AVG: res = 0; break;
 								        case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
 								        default: assert(false);
 								    }
 								    for (int i = bh; i < eh; i += 1) {
 								        for (int j = bw; j < ew; j += 1) {
 								#if __CUDA_ARCH__ >= 350
 								            Ti cur = __ldg(i_ptr + i * iw + j);
 								#else
 								            Ti cur = i_ptr[i * iw + j];
 								#endif
 								            switch (op) {
 								                case GGML_OP_POOL_AVG: res += cur * scale; break;
 								                case GGML_OP_POOL_MAX: res = max(res, (To)cur); break;
 								                default: assert(false);
 								            }
 								        }
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 16:04:36 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    o_ptr[cur_oh * ow + cur_ow] = res;
-												cuda : add RoPE kernel for mode == 2 (NeoX) (#2760)

* cuda : add RoPE kernel for mode == 2 (NeoX)

* falcon : do not offload the embeddings layer
											
										
										
											2023-08-25 08:55:59 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<int qk, int qr, dequantize_kernel_t dq>
 								static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 								                            const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
-												ggml : sync (ggml_conv_2d, fix mul_mat bug, CUDA GLM rope)

											
										
										
											2023-07-14 13:36:41 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_TENSOR_BINARY_OP_LOCALS
-												ggml : sync latest (SAM + SD operators, CUDA alibi) (#2709)

* ggml : sync latest (SAM + SD operators, CUDA alibi)

ggml-ci

* ggml : fix tabs
											
										
										
											2023-08-22 11:22:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
 								    const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
 								    const dim3 block_nums(block_num_x, ne10, ne11*ne12);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // strides in elements
 								    //const size_t s0 = nb0 / ggml_element_size(dst);
 								    const size_t s1 = nb1 / ggml_element_size(dst);
 								    const size_t s2 = nb2 / ggml_element_size(dst);
 								    const size_t s3 = nb3 / ggml_element_size(dst);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const size_t s10 = nb10 / ggml_element_size(src1);
 								    const size_t s11 = nb11 / ggml_element_size(src1);
 								    const size_t s12 = nb12 / ggml_element_size(src1);
 								    //const size_t s13 = nb13 / ggml_element_size(src1);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_ASSERT(ne00 % 2 == 0);
 								    k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
 								            src0_dd, src1_dd, dst_dd,
 								            ne00, /*ne01, ne02, ne03,*/
 								            /*ne10, ne11,*/ ne12, /*ne13,*/
 								            /* s0,*/ s1, s2, s3,
 								            /* nb00,*/ nb01, nb02, nb03,
 								            s10, s11, s12/*, s13*/);
 								    GGML_UNUSED(dst);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<typename src0_t>
 								static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 								                                const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
-												ggml : add ALiBi support for ggml_soft_max_ext (#5488)

* ggml : avoid recomputing alibi slopes (CPU)

* llama : reuse hparams.f_max_alibi_bias in all cases

ggml-ci

* ggml : support alibi bias in ggml_soft_max_ext (CPU + Metal)

ggml-ci

* ggml : handle all SRCs (do not break on first null)

ggml-ci

* tests : do not use slope for large soft_max

accumulates too much error

ggml-ci

* ggml : alternative ALiBi without extra tensor

We compute the slopes in the kernel

ggml-ci

* cuda : add ALiBi support in ggml_soft_max_ext

ggml-ci

* ggml : deprecate ggml_alibi

* ggml : support multi-sequence ALiBi (Metal)

ggml-ci

* cuda : add multi-seq ALiBi + remote F16 soft_max

ggml-ci

* ggml : update deprecation message

* ggml : fix pos ptr when no ALiBi

ggml-ci

* cuda : fix performance (pow -> powf)

* cuda : precompute ALiBi constants

* metal : pre-compute ALiBi slopes

ggml-ci

* llama : init kq_pos only if needed

ggml-ci

* test-backend-ops : add null pos test to soft_max

test-backend-ops : replace soft_max tests

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-17 21:04:16 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_TENSOR_BINARY_OP_LOCALS
-												ggml : add ALiBi support for ggml_soft_max_ext (#5488)

* ggml : avoid recomputing alibi slopes (CPU)

* llama : reuse hparams.f_max_alibi_bias in all cases

ggml-ci

* ggml : support alibi bias in ggml_soft_max_ext (CPU + Metal)

ggml-ci

* ggml : handle all SRCs (do not break on first null)

ggml-ci

* tests : do not use slope for large soft_max

accumulates too much error

ggml-ci

* ggml : alternative ALiBi without extra tensor

We compute the slopes in the kernel

ggml-ci

* cuda : add ALiBi support in ggml_soft_max_ext

ggml-ci

* ggml : deprecate ggml_alibi

* ggml : support multi-sequence ALiBi (Metal)

ggml-ci

* cuda : add multi-seq ALiBi + remote F16 soft_max

ggml-ci

* ggml : update deprecation message

* ggml : fix pos ptr when no ALiBi

ggml-ci

* cuda : fix performance (pow -> powf)

* cuda : precompute ALiBi constants

* metal : pre-compute ALiBi slopes

ggml-ci

* llama : init kq_pos only if needed

ggml-ci

* test-backend-ops : add null pos test to soft_max

test-backend-ops : replace soft_max tests

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-17 21:04:16 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
 								    const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
 								    const dim3 block_nums(block_num_x, ne10, ne11*ne12);
-												ggml : add ALiBi support for ggml_soft_max_ext (#5488)

* ggml : avoid recomputing alibi slopes (CPU)

* llama : reuse hparams.f_max_alibi_bias in all cases

ggml-ci

* ggml : support alibi bias in ggml_soft_max_ext (CPU + Metal)

ggml-ci

* ggml : handle all SRCs (do not break on first null)

ggml-ci

* tests : do not use slope for large soft_max

accumulates too much error

ggml-ci

* ggml : alternative ALiBi without extra tensor

We compute the slopes in the kernel

ggml-ci

* cuda : add ALiBi support in ggml_soft_max_ext

ggml-ci

* ggml : deprecate ggml_alibi

* ggml : support multi-sequence ALiBi (Metal)

ggml-ci

* cuda : add multi-seq ALiBi + remote F16 soft_max

ggml-ci

* ggml : update deprecation message

* ggml : fix pos ptr when no ALiBi

ggml-ci

* cuda : fix performance (pow -> powf)

* cuda : precompute ALiBi constants

* metal : pre-compute ALiBi slopes

ggml-ci

* llama : init kq_pos only if needed

ggml-ci

* test-backend-ops : add null pos test to soft_max

test-backend-ops : replace soft_max tests

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-17 21:04:16 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // strides in elements
 								    //const size_t s0 = nb0 / ggml_element_size(dst);
 								    const size_t s1 = nb1 / ggml_element_size(dst);
 								    const size_t s2 = nb2 / ggml_element_size(dst);
 								    const size_t s3 = nb3 / ggml_element_size(dst);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const size_t s10 = nb10 / ggml_element_size(src1);
 								    const size_t s11 = nb11 / ggml_element_size(src1);
 								    const size_t s12 = nb12 / ggml_element_size(src1);
 								    //const size_t s13 = nb13 / ggml_element_size(src1);
 								    k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
 								            src0_dd, src1_dd, dst_dd,
 								            ne00, /*ne01, ne02, ne03,*/
 								            /*ne10, ne11,*/ ne12, /*ne13,*/
 								            /* s0,*/ s1, s2, s3,
 								            /* nb00,*/ nb01, nb02, nb03,
 								            s10, s11, s12/*, s13*/);
 								    GGML_UNUSED(dst);
-												ggml : sync (im2col, GPU conv, 32-bit arm compat) (#4060)

ggml-ci
											
										
										
											2023-11-13 14:55:52 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<float (*bin_op)(const float, const float)>
 								struct bin_bcast_cuda {
 								    template<typename src0_t, typename src1_t, typename dst_t>
 								    void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst,
 								            const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd,
 								            cudaStream_t stream) {
 								        GGML_TENSOR_BINARY_OP_LOCALS
-												Improve cuBLAS performance by using a memory pool (#1094)

* Improve cuBLAS performance by using a memory pool

* Move cuda specific definitions to ggml-cuda.h/cu

* Add CXX flags to nvcc

* Change memory pool synchronization mechanism to a spin lock
General code cleanup
											
										
										
											2023-04-21 19:59:17 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        int nr0 = ne10/ne0;
 								        int nr1 = ne11/ne1;
 								        int nr2 = ne12/ne2;
 								        int nr3 = ne13/ne3;
-												Improve cuBLAS performance by using a memory pool (#1094)

* Improve cuBLAS performance by using a memory pool

* Move cuda specific definitions to ggml-cuda.h/cu

* Add CXX flags to nvcc

* Change memory pool synchronization mechanism to a spin lock
General code cleanup
											
										
										
											2023-04-21 19:59:17 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        int nr[4] = { nr0, nr1, nr2, nr3 };
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        // collapse dimensions until first broadcast dimension
 								        int64_t cne0[] = {ne0, ne1, ne2, ne3};
 								        int64_t cne1[] = {ne10, ne11, ne12, ne13};
 								        size_t cnb0[] = {nb0, nb1, nb2, nb3};
 								        size_t cnb1[] = {nb10, nb11, nb12, nb13};
 								        auto collapse = [](int64_t cne[]) {
 								            cne[0] *= cne[1];
 								            cne[1] = cne[2];
 								            cne[2] = cne[3];
 								            cne[3] = 1;
 								        };
-												Improve cuBLAS performance by using a memory pool (#1094)

* Improve cuBLAS performance by using a memory pool

* Move cuda specific definitions to ggml-cuda.h/cu

* Add CXX flags to nvcc

* Change memory pool synchronization mechanism to a spin lock
General code cleanup
											
										
										
											2023-04-21 19:59:17 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        auto collapse_nb = [](size_t cnb[], const int64_t cne[]) {
 								            cnb[1] *= cne[1];
 								            cnb[2] *= cne[2];
 								            cnb[3] *= cne[3];
 								        };
-												Improve cuBLAS performance by using a memory pool (#1094)

* Improve cuBLAS performance by using a memory pool

* Move cuda specific definitions to ggml-cuda.h/cu

* Add CXX flags to nvcc

* Change memory pool synchronization mechanism to a spin lock
General code cleanup
											
										
										
											2023-04-21 19:59:17 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        for (int i = 0; i < 4; i++) {
 								            if (nr[i] != 1) {
 								                break;
 								            }
 								            if (i > 0) {
 								                collapse_nb(cnb0, cne0);
 								                collapse_nb(cnb1, cne1);
 								                collapse(cne0);
 								                collapse(cne1);
-												Custom RoPE + bettter memory management for CUDA (#2295)

* Custom RoPE + bettter memory management for CUDA

* Adjusted look ahead in ggml_cuda_pool_malloc to 5%

This is sufficient it seems.
We end up using about 200 MB less VRAM that way when running
the 13B model with context 8192.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-21 14:27:51 +00:00
+								            }
-												Improve cuBLAS performance by using a memory pool (#1094)

* Improve cuBLAS performance by using a memory pool

* Move cuda specific definitions to ggml-cuda.h/cu

* Add CXX flags to nvcc

* Change memory pool synchronization mechanism to a spin lock
General code cleanup
											
										
										
											2023-04-21 19:59:17 +00:00
+								        }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        {
 								            int64_t ne0 = cne0[0];
 								            int64_t ne1 = cne0[1];
 								            int64_t ne2 = cne0[2];
 								            int64_t ne3 = cne0[3];
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            int64_t ne10 = cne1[0];
 								            int64_t ne11 = cne1[1];
 								            int64_t ne12 = cne1[2];
 								            int64_t ne13 = cne1[3];
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            size_t nb0 = cnb0[0];
 								            size_t nb1 = cnb0[1];
 								            size_t nb2 = cnb0[2];
 								            size_t nb3 = cnb0[3];
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            size_t nb10 = cnb1[0];
 								            size_t nb11 = cnb1[1];
 								            size_t nb12 = cnb1[2];
 								            size_t nb13 = cnb1[3];
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            size_t s0 = nb0 / sizeof(dst_t);
 								            size_t s1 = nb1 / sizeof(dst_t);
 								            size_t s2 = nb2 / sizeof(dst_t);
 								            size_t s3 = nb3 / sizeof(dst_t);
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            size_t s10 = nb10 / sizeof(src1_t);
 								            size_t s11 = nb11 / sizeof(src1_t);
 								            size_t s12 = nb12 / sizeof(src1_t);
 								            size_t s13 = nb13 / sizeof(src1_t);
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            GGML_ASSERT(s0 == 1);
 								            GGML_ASSERT(s10 == 1);
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            const int block_size = 128;
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            int64_t hne0 = std::max(ne0/2LL, 1LL);
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            dim3 block_dims;
 								            block_dims.x = std::min<unsigned int>(hne0, block_size);
 								            block_dims.y = std::min<unsigned int>(ne1, block_size / block_dims.x);
 								            block_dims.z = std::min(std::min<unsigned int>(ne2*ne3, block_size / block_dims.x / block_dims.y), 64U);
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 20:23:59 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            dim3 block_nums(
 								                (hne0 + block_dims.x - 1) / block_dims.x,
 								                (ne1 + block_dims.y - 1) / block_dims.y,
 								                (ne2*ne3 + block_dims.z - 1) / block_dims.z
 								            );
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            if (block_nums.z > 65535) {
 								                // this is the maximum number of blocks in z direction, fallback to 1D grid kernel
 								                int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
 								                k_bin_bcast_unravel<bin_op><<<block_num, block_size, 0, stream>>>(
 								                    src0_dd, src1_dd, dst_dd,
 								                    ne0, ne1, ne2, ne3,
 								                    ne10, ne11, ne12, ne13,
 								                    /* s0, */ s1, s2, s3,
 								                    /* s10, */ s11, s12, s13);
 								            } else {
 								                k_bin_bcast<bin_op><<<block_nums, block_dims, 0, stream>>>(
 								                    src0_dd, src1_dd, dst_dd,
 								                    ne0, ne1, ne2, ne3,
 								                    ne10, ne11, ne12, ne13,
 								                    /* s0, */ s1, s2, s3,
 								                    /* s10, */ s11, s12, s13);
 								            }
 								        }
 								    }
 								};
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
 								    const int ne10, const int ne11, const int ne12,
 								    const int nb1, const int nb2, const int offset, cudaStream_t stream) {
 								    int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
 								    acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
 								}
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
 								    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
 								    gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 								}
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
 								    const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
 								    silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 								}
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void gelu_quick_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
 								    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
 								    gelu_quick_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 								}
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void tanh_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
 								    const int num_blocks = (k + CUDA_TANH_BLOCK_SIZE - 1) / CUDA_TANH_BLOCK_SIZE;
 								    tanh_f32<<<num_blocks, CUDA_TANH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 								}
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
 								    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
 								    relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void hardsigmoid_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
 								    const int num_blocks = (k + CUDA_HARDSIGMOID_BLOCK_SIZE - 1) / CUDA_HARDSIGMOID_BLOCK_SIZE;
 								    hardsigmoid_f32<<<num_blocks, CUDA_HARDSIGMOID_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 								}
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void hardswish_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
 								    const int num_blocks = (k + CUDA_HARDSWISH_BLOCK_SIZE - 1) / CUDA_HARDSWISH_BLOCK_SIZE;
 								    hardswish_f32<<<num_blocks, CUDA_HARDSWISH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 								}
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void leaky_relu_f32_cuda(const float * x, float * dst, const int k, const float negative_slope, cudaStream_t stream) {
 								    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
 								    leaky_relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, negative_slope);
 								}
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
 								    const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
 								    sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
 								    GGML_ASSERT(ncols % WARP_SIZE == 0);
 								    if (ncols < 1024) {
 								        const dim3 block_dims(WARP_SIZE, 1, 1);
 								        norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
+								    } else {
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const dim3 block_dims(1024, 1, 1);
 								        norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
+								    }
 								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const int group_size, const int ne_elements, cudaStream_t stream) {
 								    static const float eps = 1e-6f;
 								    if (group_size < 1024) {
 								        const dim3 block_dims(WARP_SIZE, 1, 1);
 								        group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
+								    } else {
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const dim3 block_dims(1024, 1, 1);
 								        group_norm_f32<1024><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
+								    }
-												Improve cuBLAS performance by using a memory pool (#1094)

* Improve cuBLAS performance by using a memory pool

* Move cuda specific definitions to ggml-cuda.h/cu

* Add CXX flags to nvcc

* Change memory pool synchronization mechanism to a spin lock
General code cleanup
											
										
										
											2023-04-21 19:59:17 +00:00
+								}
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void concat_f32_cuda(const float * x, const float * y, float * dst, const int ne0, int ne1, int ne2, int ne02, cudaStream_t stream) {
 								    int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
 								    dim3 gridDim(num_blocks, ne1, ne2);
 								    concat_f32<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
 								}
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int ne03,
 								                             const int scale_factor, cudaStream_t stream) {
 								    int ne0 = (ne00 * scale_factor);
 								    int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
 								    dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02*ne03);
 								    upscale_f32<<<gridDim, CUDA_UPSCALE_BLOCK_SIZE, 0, stream>>>(x, dst, ne00, ne00 * ne01, scale_factor);
 								}
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void pad_f32_cuda(const float * x, float * dst,
 								    const int ne00, const int ne01, const int ne02, const int ne03,
 								    const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
 								    int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
 								    dim3 gridDim(num_blocks, ne1, ne2*ne3);
 								    pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
 								}
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
 								    int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
 								    arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start,  step);
 								}
-												Improve cuBLAS performance by using a memory pool (#1094)

* Improve cuBLAS performance by using a memory pool

* Move cuda specific definitions to ggml-cuda.h/cu

* Add CXX flags to nvcc

* Change memory pool synchronization mechanism to a spin lock
General code cleanup
											
										
										
											2023-04-21 19:59:17 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void timestep_embedding_f32_cuda(const float * x, float * dst, const int ne00, const int nb1,
 								                                        const int dim, const int max_period, cudaStream_t stream) {
 								    int half_ceil = (dim + 1) / 2;
 								    int num_blocks = (half_ceil + CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE;
 								    dim3 gridDim(num_blocks, ne00, 1);
 								    timestep_embedding_f32<<<gridDim, CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE, 0, stream>>>(x, dst, nb1, dim, max_period);
 								}
-												cuda : supports running on CPU for GGML_USE_CUBLAS=ON build (#3946)

* protyping the idea that supports running on CPU for a GGML_USE_CUBLAS=on build

* doc: add comments to ggml_cublas_loaded()

* fix defined(...)
											
										
										
											2023-11-07 06:49:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
 								    GGML_ASSERT(ncols % WARP_SIZE == 0);
 								    if (ncols < 1024) {
 								        const dim3 block_dims(WARP_SIZE, 1, 1);
 								        rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
 								    } else {
 								        const dim3 block_dims(1024, 1, 1);
 								        rms_norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
 								    }
 								}
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
 								    const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
 								    const dim3 num_blocks(block_num_x, ky, 1);
 								    const dim3 block_size(CUDA_DEQUANTIZE_BLOCK_SIZE, 1, 1);
 								    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
 								}
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 09:09:42 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 								static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
 								    const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE);
 								    dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 								}
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 09:09:42 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int k, cudaStream_t stream) {
 								    const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1) / CUDA_Q8_0_NE_ALIGN;
 								    if (k % CUDA_Q8_0_NE_ALIGN == 0) {
 								        const bool need_check = false;
 								        dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
 								    } else {
 								        const bool need_check = true;
 								        dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
 								    }
 								}
-												cuda : supports running on CPU for GGML_USE_CUBLAS=ON build (#3946)

* protyping the idea that supports running on CPU for a GGML_USE_CUBLAS=on build

* doc: add comments to ggml_cublas_loaded()

* fix defined(...)
											
										
										
											2023-11-07 06:49:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<typename dst_t>
 								static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 								    const int nb = k / QK_K;
 								#if QK_K == 256
 								    dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
-												cuda : improve text-generation and batched decoding performance (#3776)

* cuda : prints wip

* cuda : new cublas gemm branch for multi-batch quantized src0

* cuda : add F32 sgemm branch

* cuda : fine-tune >= VOLTA params + use MMQ only for small batches

* cuda : remove duplicated cuBLAS GEMM code

* cuda : add CUDA_USE_TENSOR_CORES and GGML_CUDA_FORCE_MMQ macros

* build : add compile option to force use of MMQ kernels
											
										
										
											2023-10-27 14:01:23 +00:00
+								#else
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    dequantize_block_q2_K<<<nb, 32, 0, stream>>>(vx, y);
-												cuda : improve text-generation and batched decoding performance (#3776)

* cuda : prints wip

* cuda : new cublas gemm branch for multi-batch quantized src0

* cuda : add F32 sgemm branch

* cuda : fine-tune >= VOLTA params + use MMQ only for small batches

* cuda : remove duplicated cuBLAS GEMM code

* cuda : add CUDA_USE_TENSOR_CORES and GGML_CUDA_FORCE_MMQ macros

* build : add compile option to force use of MMQ kernels
											
										
										
											2023-10-27 14:01:23 +00:00
+								#endif
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								}
 								template<typename dst_t>
 								static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 								    const int nb = k / QK_K;
 								#if QK_K == 256
 								    dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
-												cuda : improve text-generation and batched decoding performance (#3776)

* cuda : prints wip

* cuda : new cublas gemm branch for multi-batch quantized src0

* cuda : add F32 sgemm branch

* cuda : fine-tune >= VOLTA params + use MMQ only for small batches

* cuda : remove duplicated cuBLAS GEMM code

* cuda : add CUDA_USE_TENSOR_CORES and GGML_CUDA_FORCE_MMQ macros

* build : add compile option to force use of MMQ kernels
											
										
										
											2023-10-27 14:01:23 +00:00
+								#else
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    dequantize_block_q3_K<<<nb, 32, 0, stream>>>(vx, y);
-												cuda : improve text-generation and batched decoding performance (#3776)

* cuda : prints wip

* cuda : new cublas gemm branch for multi-batch quantized src0

* cuda : add F32 sgemm branch

* cuda : fine-tune >= VOLTA params + use MMQ only for small batches

* cuda : remove duplicated cuBLAS GEMM code

* cuda : add CUDA_USE_TENSOR_CORES and GGML_CUDA_FORCE_MMQ macros

* build : add compile option to force use of MMQ kernels
											
										
										
											2023-10-27 14:01:23 +00:00
+								#endif
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								}
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<typename dst_t>
 								static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 								    const int nb32 = k / 32;
 								    const int nb = (k + 255) / 256;
 								    dequantize_block_q4_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
 								}
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<typename dst_t>
 								static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 								    const int nb32 = k / 32;
 								    const int nb = (k + 255) / 256;
 								    dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
 								}
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<typename dst_t>
 								static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 								    const int nb = k / QK_K;
 								    dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
 								}
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<typename dst_t>
 								static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 								    const int nb = k / QK_K;
 								#if QK_K == 256
 								    dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 09:20:24 +00:00
+								#else
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    dequantize_block_q5_K<<<nb, 32, 0, stream>>>(vx, y);
 								#endif
 								}
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<typename dst_t>
 								static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 								    const int nb = k / QK_K;
 								#if QK_K == 256
 								    dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
 								#else
 								    dequantize_block_q6_K<<<nb, 32, 0, stream>>>(vx, y);
 								#endif
 								}
-												cuBLAS: use host pinned memory and dequantize while copying (#1207)

* cuBLAS: dequantize simultaneously while copying memory

* cuBLAS: use host pinned memory

* cuBLAS: improve ggml_compute_forward_mul_mat_f16_f32 with pinned memory

* cuBLAS: also pin kv cache

* fix rebase
											
										
										
											2023-04-29 00:04:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<typename dst_t>
 								static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 								    const int nb = k / QK_K;
 								    dequantize_block_iq2_xxs<<<nb, 32, 0, stream>>>(vx, y);
 								}
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<typename dst_t>
 								static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 								    const int nb = k / QK_K;
 								    dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<typename dst_t>
 								static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 								    const int nb = k / QK_K;
 								    dequantize_block_iq2_s<<<nb, 32, 0, stream>>>(vx, y);
 								}
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 16:11:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<typename dst_t>
 								static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 								    const int nb = k / QK_K;
 								    dequantize_block_iq3_xxs<<<nb, 32, 0, stream>>>(vx, y);
 								}
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 16:11:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<typename dst_t>
 								static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 								    const int nb = k / QK_K;
 								    dequantize_block_iq3_s<<<nb, 32, 0, stream>>>(vx, y);
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 16:11:07 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<typename dst_t>
 								static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 								    const int nb = k / QK_K;
 								    dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
-												Improve cuBLAS performance by dequantizing on the GPU (#1065)


											
										
										
											2023-04-20 01:14:14 +00:00
+								}
-												cuBLAS: non-contiguous tensor support (#1215)

* Cuda: non-contiguous tensor support

* remove extra stuff

* rename

* fix error

* more fixes, now OpenBLAS and CLBlast build too

* now then?
											
										
										
											2023-04-28 23:31:56 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<typename dst_t>
 								static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 								    const int nb = (k + QK_K - 1) / QK_K;
 								    dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
 								}
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<typename dst_t>
 								static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 								    const int nb = (k + QK_K - 1) / QK_K;
 								#if QK_K == 64
 								    dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
 								#else
 								    dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
 								#endif
 								}
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <typename src_t, typename dst_t>
 								static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
 								    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
 								    convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 								}
-												cuBLAS: non-contiguous tensor support (#1215)

* Cuda: non-contiguous tensor support

* remove extra stuff

* rename

* fix error

* more fixes, now OpenBLAS and CLBlast build too

* now then?
											
										
										
											2023-04-28 23:31:56 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
 								    int id;
 								    switch (type) {
 								        case GGML_TYPE_Q4_0:
 								            return dequantize_row_q4_0_cuda;
 								        case GGML_TYPE_Q4_1:
 								            return dequantize_row_q4_1_cuda;
 								        case GGML_TYPE_Q5_0:
 								            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
 								        case GGML_TYPE_Q5_1:
 								            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
 								        case GGML_TYPE_Q8_0:
 								            CUDA_CHECK(cudaGetDevice(&id));
 								            if (get_cuda_global_info().devices[id].cc >= CC_PASCAL) {
 								                return dequantize_block_q8_0_f16_cuda;
 								            }
 								            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
 								        case GGML_TYPE_Q2_K:
 								            return dequantize_row_q2_K_cuda;
 								        case GGML_TYPE_Q3_K:
 								            return dequantize_row_q3_K_cuda;
 								        case GGML_TYPE_Q4_K:
 								            return dequantize_row_q4_K_cuda;
 								        case GGML_TYPE_Q5_K:
 								            return dequantize_row_q5_K_cuda;
 								        case GGML_TYPE_Q6_K:
 								            return dequantize_row_q6_K_cuda;
 								        case GGML_TYPE_IQ2_XXS:
 								            return dequantize_row_iq2_xxs_cuda;
 								        case GGML_TYPE_IQ2_XS:
 								            return dequantize_row_iq2_xs_cuda;
 								        case GGML_TYPE_IQ2_S:
 								            return dequantize_row_iq2_s_cuda;
 								        case GGML_TYPE_IQ3_XXS:
 								            return dequantize_row_iq3_xxs_cuda;
 								        case GGML_TYPE_IQ1_S:
 								            return dequantize_row_iq1_s_cuda;
 								        case GGML_TYPE_IQ4_NL:
 								            return dequantize_row_iq4_nl_cuda;
 								        case GGML_TYPE_IQ4_XS:
 								            return dequantize_row_iq4_xs_cuda;
 								        case GGML_TYPE_IQ3_S:
 								            return dequantize_row_iq3_s_cuda;
 								        case GGML_TYPE_F32:
 								            return convert_unary_cuda<float>;
 								        default:
 								            return nullptr;
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
+								    }
 								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
 								    switch (type) {
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
+								        case GGML_TYPE_Q4_0:
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            return dequantize_row_q4_0_cuda;
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
+								        case GGML_TYPE_Q4_1:
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            return dequantize_row_q4_1_cuda;
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
+								        case GGML_TYPE_Q5_0:
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
+								        case GGML_TYPE_Q5_1:
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
+								        case GGML_TYPE_Q8_0:
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
 								        case GGML_TYPE_Q2_K:
 								            return dequantize_row_q2_K_cuda;
 								        case GGML_TYPE_Q3_K:
 								            return dequantize_row_q3_K_cuda;
 								        case GGML_TYPE_Q4_K:
 								            return dequantize_row_q4_K_cuda;
 								        case GGML_TYPE_Q5_K:
 								            return dequantize_row_q5_K_cuda;
 								        case GGML_TYPE_Q6_K:
 								            return dequantize_row_q6_K_cuda;
 								        case GGML_TYPE_IQ2_XXS:
 								            return dequantize_row_iq2_xxs_cuda;
 								        case GGML_TYPE_IQ2_XS:
 								            return dequantize_row_iq2_xs_cuda;
 								        case GGML_TYPE_IQ2_S:
 								            return dequantize_row_iq2_s_cuda;
 								        case GGML_TYPE_IQ3_XXS:
 								            return dequantize_row_iq3_xxs_cuda;
 								        case GGML_TYPE_IQ1_S:
 								            return dequantize_row_iq1_s_cuda;
 								        case GGML_TYPE_IQ4_NL:
 								            return dequantize_row_iq4_nl_cuda;
 								        case GGML_TYPE_IQ4_XS:
 								            return dequantize_row_iq4_xs_cuda;
 								        case GGML_TYPE_IQ3_S:
 								            return dequantize_row_iq3_s_cuda;
 								        case GGML_TYPE_F16:
 								            return convert_unary_cuda<half>;
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
+								        default:
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            return nullptr;
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
+								    }
 								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 								    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
 								    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
 								    // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
 								    const dim3 block_nums(block_num_y, 1, 1);
 								    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 								    dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
 								        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 								    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
 								    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
 								    const dim3 block_nums(block_num_y, 1, 1);
 								    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 								    dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
 								        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 								    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
 								    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
 								    const dim3 block_nums(block_num_y, 1, 1);
 								    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 								    dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
 								        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 								    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
 								    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
 								    const dim3 block_nums(block_num_y, 1, 1);
 								    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 								    dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
 								        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 								    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
 								    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
 								    const dim3 block_nums(block_num_y, 1, 1);
 								    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 								    dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
 								        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								}
-												cuda : support broadcast add & mul (#2192)

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-07-14 18:38:24 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 								    GGML_ASSERT(ncols % QK_K == 0);
 								    const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
 								    const int block_num_y = (nrows + ny - 1) / ny;
 								    const dim3 block_nums(block_num_y, 1, 1);
 								    const dim3 block_dims(32, ny, 1);
 								    dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-												cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)

* Broadcasting for ggml_mul

* CUDA kernel for ggml_mul, norms in VRAM

* GPU weights not in RAM, direct loading with cuFile

* fixup! GPU weights not in RAM, direct loading with cuFile

* fixup! GPU weights not in RAM, direct loading with cuFile

* define default model path once, sync path with readme (#1366)

* ~7% faster Q5_1 AVX2 code (#1477)

* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)

* Support models in a single pytorch_model.bin

* Remove spurious line with typo

* benchmark-matmul: Print the average of the test results (#1490)

* Remove unused n_parts parameter (#1509)

* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)

* Fix for w64devkit and mingw

* make kv_f16 the default for api users (#1517)

* minor : fix compile warnings

* readme : adds WizardLM to the list of supported models (#1485)

* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)

* Make reverse prompt option act as a stop token in non-interactive scenarios

* Making requested review changes

* Update gpt_params_parse and fix a merge error

* Revert "Update gpt_params_parse and fix a merge error"

This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.

* Update gpt_params_parse and fix a merge error take 2

* examples : add persistent chat (#1495)

* examples : add persistent chat

* examples : fix whitespace

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* tests : add missing header

* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)

* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0

* llama : bump LLAMA_FILE_VERSION to 3

* cuda : update Q4 and Q8 dequantize kernels

* ggml : fix AVX dot products

* readme : update performance table + hot topics

* ggml : fix scalar implementation of Q4_1 dot

* llama : fix compile warnings in llama_set_state_data()

* llama : fix name shadowing and C4146 (#1526)

* Fix name shadowing and C4146

* Fix if macros not using defined when required

* Update llama-util.h

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Update llama-util.h

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Code style

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Fix for mingw (#1462)

* llama : add llama_init_backend() API (close #1527)

* feature : add blis and other BLAS implementation support (#1502)

* feature: add blis support

* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927

* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake

* Fix typo in INTEGER

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Revert "feature : add blis and other BLAS implementation support (#1502)"

This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.

* GPU weights not in RAM, direct loading with cuFile

* llama : code style fixes + progress print fix

* ggml : ggml_mul better broadcast support

* cmake : workarounds for cufile when CMake version < 3.25

* gg rebase fixup

* Loop in llama.cpp, fixed progress callback

* Attempt clang-tidy fix

* llama : fix vram size computation

* Add forgotten fclose()

---------

Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
											
										
										
											2023-05-20 12:19:28 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 								    GGML_ASSERT(ncols % QK_K == 0);
 								    const int ny = 2 / K_QUANTS_PER_ITERATION;
 								    const int block_num_y = (nrows + ny - 1) / ny;
 								    const dim3 block_nums(block_num_y, 1, 1);
 								    const dim3 block_dims(32, ny, 1);
 								    dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-												cuda : add gelu support

											
										
										
											2023-07-12 17:26:18 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 								    GGML_ASSERT(ncols % QK_K == 0);
 								    const int ny = 2 / K_QUANTS_PER_ITERATION;
 								    const int block_num_y = (nrows + ny - 1) / ny;
 								    const dim3 block_nums(block_num_y, 1, 1);
 								    const dim3 block_dims(32, ny, 1);
 								    dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
+								}
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 16:11:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 								    GGML_ASSERT(ncols % QK_K == 0);
 								    const dim3 block_dims(32, 1, 1);
 								    dequantize_mul_mat_vec_q5_k<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols);
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 								    GGML_ASSERT(ncols % QK_K == 0);
 								    const int ny = 2 / K_QUANTS_PER_ITERATION;
 								    const int block_num_y = (nrows + ny - 1) / ny;
 								    const dim3 block_nums(block_num_y, 1, 1);
 								    const dim3 block_dims(32, ny, 1);
 								    dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 								    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
 								    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
 								    const dim3 block_nums(block_num_y, 1, 1);
 								    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 								    dequantize_mul_mat_vec<1, 1, convert_f16>
 								        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-												Add ReLU and SQR CUDA ops to (partially) fix Persimmon offloading (#4041)

* Add ReLU and SQR CUDA ops to fix Persimmon offloading

* Persimmon loader: More helpful error on CUDA/ROCM when offloading too many layers
											
										
										
											2023-11-13 08:58:15 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot>
 								static void mul_mat_vec_q_cuda(
 								    const void * vx, const void * vy, float * dst,
 								    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 13:10:15 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_ASSERT(ncols_x % qk == 0);
 								    GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE);
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 13:10:15 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int id;
 								    CUDA_CHECK(cudaGetDevice(&id));
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 13:10:15 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int64_t nwarps = 1;
 								    int64_t rows_per_cuda_block = 1;
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 13:10:15 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (get_cuda_global_info().devices[id].cc < CC_RDNA2) { // NVIDIA and AMD older than RDNA2
 								        switch(ncols_y) {
 								            case 1:
 								                nwarps = 4;
 								                rows_per_cuda_block = 1;
 								                break;
 								            case 2:
 								            case 3:
 								            case 4:
 								                nwarps = 4;
 								                rows_per_cuda_block = 2;
 								                break;
 								            case 5:
 								            case 6:
 								            case 7:
 								            case 8:
 								                nwarps = 2;
 								                rows_per_cuda_block = 2;
 								                break;
 								            default:
 								                GGML_ASSERT(false);
 								                break;
 								        }
 								    }
 								    const int64_t nblocks = (nrows_x + rows_per_cuda_block - 1) / rows_per_cuda_block;
 								    const dim3 block_nums(nblocks, 1, 1);
 								    const dim3 block_dims(WARP_SIZE, nwarps, 1);
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 13:10:15 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    switch (ncols_y) {
 								        case 1:
 								            mul_mat_vec_q<1, qk, qi, block_q_t, vdr, vec_dot>
 								                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
 								            break;
 								        case 2:
 								            mul_mat_vec_q<2, qk, qi, block_q_t, vdr, vec_dot>
 								                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
 								            break;
 								        case 3:
 								            mul_mat_vec_q<3, qk, qi, block_q_t, vdr, vec_dot>
 								                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
 								            break;
 								        case 4:
 								            mul_mat_vec_q<4, qk, qi, block_q_t, vdr, vec_dot>
 								                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
 								            break;
 								        case 5:
 								            mul_mat_vec_q<5, qk, qi, block_q_t, vdr, vec_dot>
 								                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
 								            break;
 								        case 6:
 								            mul_mat_vec_q<6, qk, qi, block_q_t, vdr, vec_dot>
 								                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
 								            break;
 								        case 7:
 								            mul_mat_vec_q<7, qk, qi, block_q_t, vdr, vec_dot>
 								                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
 								            break;
 								        case 8:
 								            mul_mat_vec_q<8, qk, qi, block_q_t, vdr, vec_dot>
 								                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
 								            break;
 								        default:
 								            GGML_ASSERT(false);
 								            break;
 								    }
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 13:10:15 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_mul_mat_q4_0_q8_1_cuda(
 								    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
 								    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int id;
 								    CUDA_CHECK(cudaGetDevice(&id));
 								    const int compute_capability = get_cuda_global_info().devices[id].cc;
-												Add ReLU and SQR CUDA ops to (partially) fix Persimmon offloading (#4041)

* Add ReLU and SQR CUDA ops to fix Persimmon offloading

* Persimmon loader: More helpful error on CUDA/ROCM when offloading too many layers
											
										
										
											2023-11-13 08:58:15 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int mmq_x, mmq_y, nwarps;
 								    if (compute_capability >= CC_RDNA2) {
 								        mmq_x  =  MMQ_X_Q4_0_RDNA2;
 								        mmq_y  =  MMQ_Y_Q4_0_RDNA2;
 								        nwarps = NWARPS_Q4_0_RDNA2;
 								    } else if (compute_capability >= CC_OFFSET_AMD) {
 								        mmq_x  =  MMQ_X_Q4_0_RDNA1;
 								        mmq_y  =  MMQ_Y_Q4_0_RDNA1;
 								        nwarps = NWARPS_Q4_0_RDNA1;
 								    } else if (compute_capability >= CC_VOLTA) {
 								        mmq_x  =  MMQ_X_Q4_0_AMPERE;
 								        mmq_y  =  MMQ_Y_Q4_0_AMPERE;
 								        nwarps = NWARPS_Q4_0_AMPERE;
 								    } else if (compute_capability >= MIN_CC_DP4A) {
 								        mmq_x  =  MMQ_X_Q4_0_PASCAL;
 								        mmq_y  =  MMQ_Y_Q4_0_PASCAL;
 								        nwarps = NWARPS_Q4_0_PASCAL;
 								    } else {
 								        GGML_ASSERT(false);
 								    }
-												Add ReLU and SQR CUDA ops to (partially) fix Persimmon offloading (#4041)

* Add ReLU and SQR CUDA ops to fix Persimmon offloading

* Persimmon loader: More helpful error on CUDA/ROCM when offloading too many layers
											
										
										
											2023-11-13 08:58:15 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
 								    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
 								    const dim3 block_nums(block_num_x, block_num_y, 1);
 								    const dim3 block_dims(WARP_SIZE, nwarps, 1);
-												Add ReLU and SQR CUDA ops to (partially) fix Persimmon offloading (#4041)

* Add ReLU and SQR CUDA ops to fix Persimmon offloading

* Persimmon loader: More helpful error on CUDA/ROCM when offloading too many layers
											
										
										
											2023-11-13 08:58:15 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (nrows_x % mmq_y == 0) {
 								        const bool need_check = false;
 								        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								    } else {
 								        const bool need_check = true;
 								        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								    }
-												Add ReLU and SQR CUDA ops to (partially) fix Persimmon offloading (#4041)

* Add ReLU and SQR CUDA ops to fix Persimmon offloading

* Persimmon loader: More helpful error on CUDA/ROCM when offloading too many layers
											
										
										
											2023-11-13 08:58:15 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_mul_mat_q4_1_q8_1_cuda(
 								    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
 								    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-												ggml : sync (abort callback, mul / add broadcast, fix alibi) (#2183)


											
										
										
											2023-07-11 19:53:34 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int id;
 								    CUDA_CHECK(cudaGetDevice(&id));
 								    const int compute_capability = get_cuda_global_info().devices[id].cc;
-												ggml : sync (abort callback, mul / add broadcast, fix alibi) (#2183)


											
										
										
											2023-07-11 19:53:34 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int mmq_x, mmq_y, nwarps;
 								    if (compute_capability >= CC_RDNA2) {
 								        mmq_x  =  MMQ_X_Q4_1_RDNA2;
 								        mmq_y  =  MMQ_Y_Q4_1_RDNA2;
 								        nwarps = NWARPS_Q4_1_RDNA2;
 								    } else if (compute_capability >= CC_OFFSET_AMD) {
 								        mmq_x  =  MMQ_X_Q4_1_RDNA1;
 								        mmq_y  =  MMQ_Y_Q4_1_RDNA1;
 								        nwarps = NWARPS_Q4_1_RDNA1;
 								    } else if (compute_capability >= CC_VOLTA) {
 								        mmq_x  =  MMQ_X_Q4_1_AMPERE;
 								        mmq_y  =  MMQ_Y_Q4_1_AMPERE;
 								        nwarps = NWARPS_Q4_1_AMPERE;
 								    } else if (compute_capability >= MIN_CC_DP4A) {
 								        mmq_x  =  MMQ_X_Q4_1_PASCAL;
 								        mmq_y  =  MMQ_Y_Q4_1_PASCAL;
 								        nwarps = NWARPS_Q4_1_PASCAL;
 								    } else {
 								        GGML_ASSERT(false);
 								    }
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
 								    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
 								    const dim3 block_nums(block_num_x, block_num_y, 1);
 								    const dim3 block_dims(WARP_SIZE, nwarps, 1);
-												ggml : sync (abort callback, mul / add broadcast, fix alibi) (#2183)


											
										
										
											2023-07-11 19:53:34 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (nrows_x % mmq_y == 0) {
 								        const bool need_check = false;
 								        mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								    } else {
 								        const bool need_check = true;
 								        mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								    }
-												ggml : sync (abort callback, mul / add broadcast, fix alibi) (#2183)


											
										
										
											2023-07-11 19:53:34 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_mul_mat_q5_0_q8_1_cuda(
 								    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
 								    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int id;
 								    CUDA_CHECK(cudaGetDevice(&id));
 								    const int compute_capability = get_cuda_global_info().devices[id].cc;
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int mmq_x, mmq_y, nwarps;
 								    if (compute_capability >= CC_RDNA2) {
 								        mmq_x  =  MMQ_X_Q5_0_RDNA2;
 								        mmq_y  =  MMQ_Y_Q5_0_RDNA2;
 								        nwarps = NWARPS_Q5_0_RDNA2;
 								    } else if (compute_capability >= CC_OFFSET_AMD) {
 								        mmq_x  =  MMQ_X_Q5_0_RDNA1;
 								        mmq_y  =  MMQ_Y_Q5_0_RDNA1;
 								        nwarps = NWARPS_Q5_0_RDNA1;
 								    } else if (compute_capability >= CC_VOLTA) {
 								        mmq_x  =  MMQ_X_Q5_0_AMPERE;
 								        mmq_y  =  MMQ_Y_Q5_0_AMPERE;
 								        nwarps = NWARPS_Q5_0_AMPERE;
 								    } else if (compute_capability >= MIN_CC_DP4A) {
 								        mmq_x  =  MMQ_X_Q5_0_PASCAL;
 								        mmq_y  =  MMQ_Y_Q5_0_PASCAL;
 								        nwarps = NWARPS_Q5_0_PASCAL;
 								    } else {
 								        GGML_ASSERT(false);
 								    }
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
 								    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
 								    const dim3 block_nums(block_num_x, block_num_y, 1);
 								    const dim3 block_dims(WARP_SIZE, nwarps, 1);
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (nrows_x % mmq_y == 0) {
 								        const bool need_check = false;
 								        mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								    } else {
 								        const bool need_check = true;
 								        mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
+								    }
 								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_mul_mat_q5_1_q8_1_cuda(
 								    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
 								    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int id;
 								    CUDA_CHECK(cudaGetDevice(&id));
 								    const int compute_capability = get_cuda_global_info().devices[id].cc;
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int mmq_x, mmq_y, nwarps;
 								    if (compute_capability >= CC_RDNA2) {
 								        mmq_x  =  MMQ_X_Q5_1_RDNA2;
 								        mmq_y  =  MMQ_Y_Q5_1_RDNA2;
 								        nwarps = NWARPS_Q5_1_RDNA2;
 								    } else if (compute_capability >= CC_OFFSET_AMD) {
 								        mmq_x  =  MMQ_X_Q5_1_RDNA1;
 								        mmq_y  =  MMQ_Y_Q5_1_RDNA1;
 								        nwarps = NWARPS_Q5_1_RDNA1;
 								    } else if (compute_capability >= CC_VOLTA) {
 								        mmq_x  =  MMQ_X_Q5_1_AMPERE;
 								        mmq_y  =  MMQ_Y_Q5_1_AMPERE;
 								        nwarps = NWARPS_Q5_1_AMPERE;
 								    } else if (compute_capability >= MIN_CC_DP4A) {
 								        mmq_x  =  MMQ_X_Q5_1_PASCAL;
 								        mmq_y  =  MMQ_Y_Q5_1_PASCAL;
 								        nwarps = NWARPS_Q5_1_PASCAL;
 								    } else {
 								        GGML_ASSERT(false);
 								    }
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
 								    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
 								    const dim3 block_nums(block_num_x, block_num_y, 1);
 								    const dim3 block_dims(WARP_SIZE, nwarps, 1);
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (nrows_x % mmq_y == 0) {
 								        const bool need_check = false;
 								        mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								    } else {
 								        const bool need_check = true;
 								        mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								    }
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_mul_mat_q8_0_q8_1_cuda(
 								    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
 								    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int id;
 								    CUDA_CHECK(cudaGetDevice(&id));
 								    const int compute_capability = get_cuda_global_info().devices[id].cc;
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int mmq_x, mmq_y, nwarps;
 								    if (compute_capability >= CC_RDNA2) {
 								        mmq_x  =  MMQ_X_Q8_0_RDNA2;
 								        mmq_y  =  MMQ_Y_Q8_0_RDNA2;
 								        nwarps = NWARPS_Q8_0_RDNA2;
 								    } else if (compute_capability >= CC_OFFSET_AMD) {
 								        mmq_x  =  MMQ_X_Q8_0_RDNA1;
 								        mmq_y  =  MMQ_Y_Q8_0_RDNA1;
 								        nwarps = NWARPS_Q8_0_RDNA1;
 								    } else if (compute_capability >= CC_VOLTA) {
 								        mmq_x  =  MMQ_X_Q8_0_AMPERE;
 								        mmq_y  =  MMQ_Y_Q8_0_AMPERE;
 								        nwarps = NWARPS_Q8_0_AMPERE;
 								    } else if (compute_capability >= MIN_CC_DP4A) {
 								        mmq_x  =  MMQ_X_Q8_0_PASCAL;
 								        mmq_y  =  MMQ_Y_Q8_0_PASCAL;
 								        nwarps = NWARPS_Q8_0_PASCAL;
 								    } else {
 								        GGML_ASSERT(false);
 								    }
 								    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
 								    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
 								    const dim3 block_nums(block_num_x, block_num_y, 1);
 								    const dim3 block_dims(WARP_SIZE, nwarps, 1);
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 12:23:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (nrows_x % mmq_y == 0) {
 								        const bool need_check = false;
 								        mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								    } else {
 								        const bool need_check = true;
 								        mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								    }
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 12:23:52 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_mul_mat_q2_K_q8_1_cuda(
 								    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
 								    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 12:23:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int id;
 								    CUDA_CHECK(cudaGetDevice(&id));
 								    const int compute_capability = get_cuda_global_info().devices[id].cc;
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 12:23:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int mmq_x, mmq_y, nwarps;
 								    if (compute_capability >= CC_RDNA2) {
 								        mmq_x  =  MMQ_X_Q2_K_RDNA2;
 								        mmq_y  =  MMQ_Y_Q2_K_RDNA2;
 								        nwarps = NWARPS_Q2_K_RDNA2;
 								    } else if (compute_capability >= CC_OFFSET_AMD) {
 								        mmq_x  =  MMQ_X_Q2_K_RDNA1;
 								        mmq_y  =  MMQ_Y_Q2_K_RDNA1;
 								        nwarps = NWARPS_Q2_K_RDNA1;
 								    } else if (compute_capability >= CC_VOLTA) {
 								        mmq_x  =  MMQ_X_Q2_K_AMPERE;
 								        mmq_y  =  MMQ_Y_Q2_K_AMPERE;
 								        nwarps = NWARPS_Q2_K_AMPERE;
 								    } else if (compute_capability >= MIN_CC_DP4A) {
 								        mmq_x  =  MMQ_X_Q2_K_PASCAL;
 								        mmq_y  =  MMQ_Y_Q2_K_PASCAL;
 								        nwarps = NWARPS_Q2_K_PASCAL;
 								    } else {
 								        GGML_ASSERT(false);
 								    }
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 12:23:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
 								    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
 								    const dim3 block_nums(block_num_x, block_num_y, 1);
 								    const dim3 block_dims(WARP_SIZE, nwarps, 1);
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 12:23:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (nrows_x % mmq_y == 0) {
 								        const bool need_check = false;
 								        mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								    } else {
 								        const bool need_check = true;
 								        mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								    }
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 12:23:52 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_mul_mat_q3_K_q8_1_cuda(
 								    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
 								    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 12:23:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if QK_K == 256
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 12:23:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int id;
 								    CUDA_CHECK(cudaGetDevice(&id));
 								    const int compute_capability = get_cuda_global_info().devices[id].cc;
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 12:23:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int mmq_x, mmq_y, nwarps;
 								    if (compute_capability >= CC_RDNA2) {
 								        mmq_x  =  MMQ_X_Q3_K_RDNA2;
 								        mmq_y  =  MMQ_Y_Q3_K_RDNA2;
 								        nwarps = NWARPS_Q3_K_RDNA2;
 								    } else if (compute_capability >= CC_OFFSET_AMD) {
 								        mmq_x  =  MMQ_X_Q3_K_RDNA1;
 								        mmq_y  =  MMQ_Y_Q3_K_RDNA1;
 								        nwarps = NWARPS_Q3_K_RDNA1;
 								    } else if (compute_capability >= CC_VOLTA) {
 								        mmq_x  =  MMQ_X_Q3_K_AMPERE;
 								        mmq_y  =  MMQ_Y_Q3_K_AMPERE;
 								        nwarps = NWARPS_Q3_K_AMPERE;
 								    } else if (compute_capability >= MIN_CC_DP4A) {
 								        mmq_x  =  MMQ_X_Q3_K_PASCAL;
 								        mmq_y  =  MMQ_Y_Q3_K_PASCAL;
 								        nwarps = NWARPS_Q3_K_PASCAL;
 								    } else {
 								        GGML_ASSERT(false);
 								    }
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
 								    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
 								    const dim3 block_nums(block_num_x, block_num_y, 1);
 								    const dim3 block_dims(WARP_SIZE, nwarps, 1);
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (nrows_x % mmq_y == 0) {
 								        const bool need_check = false;
 								        mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								    } else {
 								        const bool need_check = true;
 								        mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								    }
 								#endif
 								}
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 16:11:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_mul_mat_q4_K_q8_1_cuda(
 								    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
 								    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 16:11:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int id;
 								    CUDA_CHECK(cudaGetDevice(&id));
 								    const int compute_capability = get_cuda_global_info().devices[id].cc;
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int mmq_x, mmq_y, nwarps;
 								    if (compute_capability >= CC_RDNA2) {
 								        mmq_x  =  MMQ_X_Q4_K_RDNA2;
 								        mmq_y  =  MMQ_Y_Q4_K_RDNA2;
 								        nwarps = NWARPS_Q4_K_RDNA2;
 								    } else if (compute_capability >= CC_OFFSET_AMD) {
 								        mmq_x  =  MMQ_X_Q4_K_RDNA1;
 								        mmq_y  =  MMQ_Y_Q4_K_RDNA1;
 								        nwarps = NWARPS_Q4_K_RDNA1;
 								    } else if (compute_capability >= CC_VOLTA) {
 								        mmq_x  =  MMQ_X_Q4_K_AMPERE;
 								        mmq_y  =  MMQ_Y_Q4_K_AMPERE;
 								        nwarps = NWARPS_Q4_K_AMPERE;
 								    } else if (compute_capability >= MIN_CC_DP4A) {
 								        mmq_x  =  MMQ_X_Q4_K_PASCAL;
 								        mmq_y  =  MMQ_Y_Q4_K_PASCAL;
 								        nwarps = NWARPS_Q4_K_PASCAL;
 								    } else {
 								        GGML_ASSERT(false);
 								    }
-												make rms_norm_eps a parameter (#2374)

* make rms_norm_eps a parameter

* add rms_norm_eps to command line

* fix baby llama, test-grad0

* use scientific notation for eps param in the help

ggml-ci
											
										
										
											2023-07-24 15:57:12 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
 								    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
 								    const dim3 block_nums(block_num_x, block_num_y, 1);
 								    const dim3 block_dims(WARP_SIZE, nwarps, 1);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (nrows_x % mmq_y == 0) {
 								        const bool need_check = false;
 								        mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								    } else {
 								        const bool need_check = true;
 								        mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								    }
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_mul_mat_q5_K_q8_1_cuda(
 								    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
 								    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int id;
 								    CUDA_CHECK(cudaGetDevice(&id));
 								    const int compute_capability = get_cuda_global_info().devices[id].cc;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int mmq_x, mmq_y, nwarps;
 								    if (compute_capability >= CC_RDNA2) {
 								        mmq_x  =  MMQ_X_Q5_K_RDNA2;
 								        mmq_y  =  MMQ_Y_Q5_K_RDNA2;
 								        nwarps = NWARPS_Q5_K_RDNA2;
 								    } else if (compute_capability >= CC_OFFSET_AMD) {
 								        mmq_x  =  MMQ_X_Q5_K_RDNA1;
 								        mmq_y  =  MMQ_Y_Q5_K_RDNA1;
 								        nwarps = NWARPS_Q5_K_RDNA1;
 								    } else if (compute_capability >= CC_VOLTA) {
 								        mmq_x  =  MMQ_X_Q5_K_AMPERE;
 								        mmq_y  =  MMQ_Y_Q5_K_AMPERE;
 								        nwarps = NWARPS_Q5_K_AMPERE;
 								    } else if (compute_capability >= MIN_CC_DP4A) {
 								        mmq_x  =  MMQ_X_Q5_K_PASCAL;
 								        mmq_y  =  MMQ_Y_Q5_K_PASCAL;
 								        nwarps = NWARPS_Q5_K_PASCAL;
 								    } else {
 								        GGML_ASSERT(false);
 								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
 								    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
 								    const dim3 block_nums(block_num_x, block_num_y, 1);
 								    const dim3 block_dims(WARP_SIZE, nwarps, 1);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (nrows_x % mmq_y == 0) {
 								        const bool need_check = false;
 								        mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								    } else {
 								        const bool need_check = true;
 								        mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								    }
 								}
 								static void ggml_mul_mat_q6_K_q8_1_cuda(
 								    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
 								    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
 								    int id;
 								    CUDA_CHECK(cudaGetDevice(&id));
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int compute_capability = get_cuda_global_info().devices[id].cc;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int mmq_x, mmq_y, nwarps;
 								    if (compute_capability >= CC_RDNA2) {
 								        mmq_x  =  MMQ_X_Q6_K_RDNA2;
 								        mmq_y  =  MMQ_Y_Q6_K_RDNA2;
 								        nwarps = NWARPS_Q6_K_RDNA2;
 								    } else if (compute_capability >= CC_OFFSET_AMD) {
 								        mmq_x  =  MMQ_X_Q6_K_RDNA1;
 								        mmq_y  =  MMQ_Y_Q6_K_RDNA1;
 								        nwarps = NWARPS_Q6_K_RDNA1;
 								    } else if (compute_capability >= CC_VOLTA) {
 								        mmq_x  =  MMQ_X_Q6_K_AMPERE;
 								        mmq_y  =  MMQ_Y_Q6_K_AMPERE;
 								        nwarps = NWARPS_Q6_K_AMPERE;
 								    } else if (compute_capability >= MIN_CC_DP4A) {
 								        mmq_x  =  MMQ_X_Q6_K_PASCAL;
 								        mmq_y  =  MMQ_Y_Q6_K_PASCAL;
 								        nwarps = NWARPS_Q6_K_PASCAL;
 								    } else {
 								        GGML_ASSERT(false);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
 								    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
 								    const dim3 block_nums(block_num_x, block_num_y, 1);
 								    const dim3 block_dims(WARP_SIZE, nwarps, 1);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 21:04:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (nrows_x % mmq_y == 0) {
 								        const bool need_check = false;
 								        mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								    } else {
 								        const bool need_check = true;
 								        mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 07:42:34 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								}
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 07:42:34 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_mul_mat_p021_f16_f32_cuda(
 								    const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
 								    const int nchannels_x, const int nchannels_y, cudaStream_t stream) {
 								    const dim3 block_nums(1, nrows_x, nchannels_y);
 								    const dim3 block_dims(WARP_SIZE, 1, 1);
 								    mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x, nchannels_y);
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 07:42:34 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_mul_mat_vec_nc_f16_f32_cuda(
 								    const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x,
 								    const int nchannels_x, const int nchannels_y, const int channel_stride_x, cudaStream_t stream) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const dim3 block_nums(1, nrows_x, nchannels_y);
 								    const dim3 block_dims(WARP_SIZE, 1, 1);
 								    mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
 								        (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x);
 								}
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
 								static void ggml_cpy_f16_f32_cuda(
 								    const char * cx, char * cdst, const int ne,
 								    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
 								    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
 								    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
 								    cpy_f32_f16<cpy_1_f16_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
 								        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cpy_f32_f32_cuda(
 								    const char * cx, char * cdst, const int ne,
 								    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
 								    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
 								    cpy_f32_f16<cpy_1_f32_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
 								        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 								}
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cpy_f32_f16_cuda(
 								    const char * cx, char * cdst, const int ne,
 								    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
 								    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
-												CUDA: fixed mmvq kernel for bs 2,3,4 and -sm row (#5386)


											
										
										
											2024-02-07 11:40:26 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
 								    cpy_f32_f16<cpy_1_f32_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
 								        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 								}
-												CUDA: fixed mmvq kernel for bs 2,3,4 and -sm row (#5386)


											
										
										
											2024-02-07 11:40:26 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cpy_f32_q8_0_cuda(
 								    const char * cx, char * cdst, const int ne,
 								    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
 								    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
-												CUDA: fixed mmvq kernel for bs 2,3,4 and -sm row (#5386)


											
										
										
											2024-02-07 11:40:26 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_ASSERT(ne % QK8_0 == 0);
 								    const int num_blocks = ne / QK8_0;
 								    cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
 								        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 								}
-												CUDA: fixed mmvq kernel for bs 2,3,4 and -sm row (#5386)


											
										
										
											2024-02-07 11:40:26 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cpy_f32_q4_0_cuda(
 								    const char * cx, char * cdst, const int ne,
 								    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
 								    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
 								    GGML_ASSERT(ne % QK4_0 == 0);
 								    const int num_blocks = ne / QK4_0;
 								    cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
 								        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 								}
 								static void ggml_cpy_f32_q4_1_cuda(
 								    const char * cx, char * cdst, const int ne,
 								    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
 								    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 08:23:56 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_ASSERT(ne % QK4_1 == 0);
 								    const int num_blocks = ne / QK4_1;
 								    cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
 								        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 17:55:51 +00:00
+								}
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
-												Add ability to use Q5_0, Q5_1, and IQ4_NL for quantized K cache (#6183)

* k_cache: be able to use Q5_0

* k_cache: be able to use Q5_1 on CODA

* k_cache: be able to use Q5_0 on Metal

* k_cache: be able to use Q5_1 on Metal

* k_cache: be able to use IQ4_NL - just CUDA for now

* k_cache: be able to use IQ4_NL on Metal

* k_cache: add newly added supported types to llama-bench and CUDA supports_op

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-03-21 07:27:57 +00:00
+								static void ggml_cpy_f32_q5_0_cuda(
 								    const char * cx, char * cdst, const int ne,
 								    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
 								    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
 								    GGML_ASSERT(ne % QK5_0 == 0);
 								    const int num_blocks = ne / QK5_0;
 								    cpy_f32_q<cpy_blck_f32_q5_0, QK5_0><<<num_blocks, 1, 0, stream>>>
 								        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 								}
 								static void ggml_cpy_f32_q5_1_cuda(
 								    const char * cx, char * cdst, const int ne,
 								    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
 								    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
 								    GGML_ASSERT(ne % QK5_1 == 0);
 								    const int num_blocks = ne / QK5_1;
 								    cpy_f32_q<cpy_blck_f32_q5_1, QK5_1><<<num_blocks, 1, 0, stream>>>
 								        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 								}
 								static void ggml_cpy_f32_iq4_nl_cuda(
 								    const char * cx, char * cdst, const int ne,
 								    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
 								    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
 								    GGML_ASSERT(ne % QK4_NL == 0);
 								    const int num_blocks = ne / QK4_NL;
 								    cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL><<<num_blocks, 1, 0, stream>>>
 								        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cpy_f16_f16_cuda(
 								    const char * cx, char * cdst, const int ne,
 								    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
 								    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 12:19:42 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
 								    cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
 								        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 								}
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 17:55:51 +00:00
-												ggml : fix some mul mat cases + add tests for src1 F16 (ggml/669)

* fixed mul-mat error for old GPUs

* style fixes

* add mul mat src1 f16 test cases, fix more cases

ggml-ci

---------

Co-authored-by: bssrdf <bssrdf@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2023-12-29 08:32:31 +00:00
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 17:55:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
 								    const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
 								    scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
 								}
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 17:55:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) {
 								    const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
 								    clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
 								}
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 08:23:56 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<typename T>
 								static void rope_cuda(
 								    const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
 								    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
 								) {
 								    GGML_ASSERT(ncols % 2 == 0);
 								    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
 								    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
 								    const dim3 block_nums(nrows, num_blocks_x, 1);
 								    if (pos == nullptr) {
 								        rope<T, false><<<block_nums, block_dims, 0, stream>>>(
 								            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
 								        );
 								    } else {
 								        rope<T, true><<<block_nums, block_dims, 0, stream>>>(
 								            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
 								        );
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 17:55:51 +00:00
+								    }
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 16:11:07 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<typename T>
 								static void rope_neox_cuda(
 								    const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
 								    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
 								) {
 								    GGML_ASSERT(ncols % 2 == 0);
 								    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
 								    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
 								    const dim3 block_nums(nrows, num_blocks_x, 1);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const float theta_scale = powf(freq_base, -2.0f/n_dims);
 								    const float inv_ndims = -1.0f / n_dims;
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 16:11:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (pos == nullptr) {
 								        rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
 								            x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
 								            theta_scale, inv_ndims
 								        );
 								    } else {
 								        rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
 								            x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
 								            theta_scale, inv_ndims
 								        );
 								    }
 								}
-												batched-bench : print params at start

											
										
										
											2023-10-25 07:26:27 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void rope_glm_f32_cuda(
 								    const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
 								    float freq_base, int n_ctx, cudaStream_t stream
 								) {
 								    GGML_ASSERT(ncols % 4 == 0);
 								    const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
 								    const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
 								    const dim3 block_nums(num_blocks_x, nrows, 1);
 								    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, n_ctx);
 								}
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 17:55:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
 								                           const int k_rows, const int n_heads_log2_floor, const float m0,
 								                           const float m1, cudaStream_t stream) {
 								    const dim3 block_dims(CUDA_ALIBI_BLOCK_SIZE, 1, 1);
 								    const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE);
 								    const dim3 block_nums(num_blocks_x, nrows, 1);
 								    alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1);
 								}
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 								    const dim3 block_dims(WARP_SIZE, 1, 1);
 								    const dim3 block_nums(nrows, 1, 1);
 								    k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
 								}
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
 								    // bitonic sort requires ncols to be power of 2
 								    GGML_ASSERT((ncols & (ncols - 1)) == 0);
-												ggml-cuda : perform cublas fp16 matrix multiplication as fp16 (#3370)

* ggml-cuda : perform cublas fp16 matrix multiplication as fp16

* try to fix rocm build

* restrict fp16 mat mul to volta and up
											
										
										
											2023-09-28 10:08:28 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const dim3 block_dims(ncols, 1, 1);
 								    const dim3 block_nums(1, nrows, 1);
 								    if (order == GGML_SORT_ORDER_ASC) {
 								        k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
 								    } else if (order == GGML_SORT_ORDER_DESC) {
 								        k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
 								    } else {
 								        GGML_ASSERT(false);
 								    }
 								}
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 16:12:57 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
 								    const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
 								    const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
 								    const dim3 block_nums(nrows_x, block_num_x, 1);
 								    diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
 								}
-												ggml-cuda : perform cublas fp16 matrix multiplication as fp16 (#3370)

* ggml-cuda : perform cublas fp16 matrix multiplication as fp16

* try to fix rocm build

* restrict fp16 mat mul to volta and up
											
										
										
											2023-09-28 10:08:28 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void soft_max_f32_cuda(const float * x, const float * mask, const float * pos, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, const float max_bias, cudaStream_t stream) {
 								    int nth = WARP_SIZE;
 								    while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
 								    const dim3 block_dims(nth,     1, 1);
 								    const dim3 block_nums(nrows_x, 1, 1);
 								    const size_t shmem = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float);
 								    static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
-												ggml-cuda : perform cublas fp16 matrix multiplication as fp16 (#3370)

* ggml-cuda : perform cublas fp16 matrix multiplication as fp16

* try to fix rocm build

* restrict fp16 mat mul to volta and up
											
										
										
											2023-09-28 10:08:28 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const uint32_t n_head_kv   = nrows_x/nrows_y;
 								    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
-												ggml-cuda : perform cublas fp16 matrix multiplication as fp16 (#3370)

* ggml-cuda : perform cublas fp16 matrix multiplication as fp16

* try to fix rocm build

* restrict fp16 mat mul to volta and up
											
										
										
											2023-09-28 10:08:28 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
 								    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
-												ggml-cuda : perform cublas fp16 matrix multiplication as fp16 (#3370)

* ggml-cuda : perform cublas fp16 matrix multiplication as fp16

* try to fix rocm build

* restrict fp16 mat mul to volta and up
											
										
										
											2023-09-28 10:08:28 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (shmem < get_cuda_global_info().devices[ggml_cuda_get_device()].smpb) {
 								        switch (ncols_x) {
 								            case 32:
 								                soft_max_f32<true, 32, 32><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
 								                break;
 								            case 64:
 								                soft_max_f32<true, 64, 64><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
 								                break;
 								            case 128:
 								                soft_max_f32<true, 128, 128><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
 								                break;
 								            case 256:
 								                soft_max_f32<true, 256, 256><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
 								                break;
 								            case 512:
 								                soft_max_f32<true, 512, 512><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
 								                break;
 								            case 1024:
 								                soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
 								                break;
 								            case 2048:
 								                soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
 								                break;
 								            case 4096:
 								                soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
 								                break;
 								            default:
 								                soft_max_f32<true, 0, 0><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
 								                break;
-												ggml : fix some mul mat cases + add tests for src1 F16 (ggml/669)

* fixed mul-mat error for old GPUs

* style fixes

* add mul mat src1 f16 test cases, fix more cases

ggml-ci

---------

Co-authored-by: bssrdf <bssrdf@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2023-12-29 08:32:31 +00:00
+								        }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    } else {
 								        const size_t shmem_low = WARP_SIZE*sizeof(float);
 								        soft_max_f32<false, 0, 0><<<block_nums, block_dims, shmem_low, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
-												CUDA: fix LoRAs (#3130)


											
										
										
											2023-09-12 22:15:33 +00:00
+								    }
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
+								}
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 16:11:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template <typename T>
 								static void im2col_cuda(const float * x, T* dst,
 								    int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
 								    int64_t batch, int64_t batch_offset, int64_t offset_delta,
 								    int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
 								    const int parallel_elements = OW * KW * KH;
 								    const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
 								    dim3 block_nums(num_blocks, OH, batch * IC);
 								    im2col_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
 								}
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 16:04:36 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static cudaError_t ggml_cuda_cpy_tensor_2d(
 								    void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {
-												cuBLAS: fall back to pageable memory if pinned alloc fails (#1233)

* cuBLAS: fall back to pageable memory if pinned alloc fails

* cuBLAS: do not use pinned memory if env variable GGML_CUDA_NO_PINNED is set
											
										
										
											2023-05-01 11:32:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_ASSERT(ggml_backend_buffer_is_cuda(src->buffer));
 								    char * src_ptr = (char *) src->data;
 								    char * dst_ptr = (char *) dst;
-												ggml : sync (ggml_conv_2d, fix mul_mat bug, CUDA GLM rope)

											
										
										
											2023-07-14 13:36:41 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t ne0 = src->ne[0];
 								    const int64_t nb0 = src->nb[0];
 								    const int64_t nb1 = src->nb[1];
 								    const int64_t nb2 = src->nb[2];
 								    const int64_t nb3 = src->nb[3];
 								    const enum ggml_type type = src->type;
 								    const int64_t ts = ggml_type_size(type);
 								    const int64_t bs = ggml_blck_size(type);
 								    int64_t i1_diff = i1_high - i1_low;
-												llama : implement YaRN RoPE scaling (#2268)

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Jeffrey Quesnelle <jquesnelle@gmail.com>

											
										
										
											2023-11-01 22:04:33 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
 								    if (nb0 == ts && nb1 == ts*ne0/bs) {
 								        return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, cudaMemcpyDeviceToDevice, stream);
 								    } else if (nb0 == ts) {
 								        return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, cudaMemcpyDeviceToDevice, stream);
-												ggml : sync (ggml_conv_2d, fix mul_mat bug, CUDA GLM rope)

											
										
										
											2023-07-14 13:36:41 +00:00
+								    } else {
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        for (int64_t i1 = 0; i1 < i1_diff; i1++) {
 								            const void * rx = (const void *) ((const char *) x + i1*nb1);
 								            void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
 								            // pretend the row is a matrix with cols=1
 								            cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, cudaMemcpyDeviceToDevice, stream);
 								            if (r != cudaSuccess) {
 								                return r;
 								            }
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 16:04:36 +00:00
+								        }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        return cudaSuccess;
-												ggml : sync (ggml_conv_2d, fix mul_mat bug, CUDA GLM rope)

											
										
										
											2023-07-14 13:36:41 +00:00
+								    }
-												cuBLAS: use host pinned memory and dequantize while copying (#1207)

* cuBLAS: dequantize simultaneously while copying memory

* cuBLAS: use host pinned memory

* cuBLAS: improve ggml_compute_forward_mul_mat_f16_f32 with pinned memory

* cuBLAS: also pin kv cache

* fix rebase
											
										
										
											2023-04-29 00:04:18 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_op_get_rows(
 								    ggml_backend_cuda_context & ctx,
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 17:55:51 +00:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const float * src0_d, const float * src1_d, float * dst_d, cudaStream_t stream) {
-												ggml : sync latest (SAM + SD operators, CUDA alibi) (#2709)

* ggml : sync latest (SAM + SD operators, CUDA alibi)

ggml-ci

* ggml : fix tabs
											
										
										
											2023-08-22 11:22:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(ctx);
-												ggml : sync latest (SAM + SD operators, CUDA alibi) (#2709)

* ggml : sync latest (SAM + SD operators, CUDA alibi)

ggml-ci

* ggml : fix tabs
											
										
										
											2023-08-22 11:22:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_ASSERT(src1->type == GGML_TYPE_I32);
 								    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-												ggml : sync latest (SAM + SD operators, CUDA alibi) (#2709)

* ggml : sync latest (SAM + SD operators, CUDA alibi)

ggml-ci

* ggml : fix tabs
											
										
										
											2023-08-22 11:22:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
 								    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
 								    GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
-												ggml : sync latest (SAM + SD operators, CUDA alibi) (#2709)

* ggml : sync latest (SAM + SD operators, CUDA alibi)

ggml-ci

* ggml : fix tabs
											
										
										
											2023-08-22 11:22:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int32_t * src1_i32 = (const int32_t *) src1_d;
-												ggml : sync latest (SAM + SD operators, CUDA alibi) (#2709)

* ggml : sync latest (SAM + SD operators, CUDA alibi)

ggml-ci

* ggml : fix tabs
											
										
										
											2023-08-22 11:22:08 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    switch (src0->type) {
 								        case GGML_TYPE_F16:
 								            get_rows_cuda_float(src0, src1, dst, (const half *)src0_d, src1_i32, dst_d, stream);
 								            break;
 								        case GGML_TYPE_F32:
 								            get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
 								            break;
 								        case GGML_TYPE_Q4_0:
 								            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
 								            break;
 								        case GGML_TYPE_Q4_1:
 								            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
 								            break;
 								        case GGML_TYPE_Q5_0:
 								            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
 								            break;
 								        case GGML_TYPE_Q5_1:
 								            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
 								            break;
 								        case GGML_TYPE_Q8_0:
 								            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
 								            break;
 								        default:
 								            // TODO: k-quants
 								            fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
 								            GGML_ASSERT(false);
 								            break;
 								    }
-												ggml : sync latest (SAM + SD operators, CUDA alibi) (#2709)

* ggml : sync latest (SAM + SD operators, CUDA alibi)

ggml-ci

* ggml : fix tabs
											
										
										
											2023-08-22 11:22:08 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								template<class op>
 								static void ggml_cuda_op_bin_bcast(
 								    ggml_backend_cuda_context & ctx,
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 13:10:15 +00:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(ctx);
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 13:10:15 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 13:10:15 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
 								        op()(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
 								    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
 								        op()(src0, src1, dst, (const half *) src0_dd, src1_dd, (half *) dst_dd, main_stream);
 								    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
 								        op()(src0, src1, dst, (const half *) src0_dd, src1_dd, dst_dd, main_stream);
 								    } else {
 								        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
 								            ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
 								        GGML_ASSERT(false);
 								    }
 								}
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 13:10:15 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_op_repeat(
 								    ggml_backend_cuda_context & ctx,
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 								    const float * src0_d, const float * src1_d, float * dst_d, cudaStream_t main_stream) {
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 13:10:15 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(ctx, dst, src0, dst, nullptr, src0_d, dst_d, main_stream);
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 13:10:15 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(src1);
 								    GGML_UNUSED(src1_d);
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 13:10:15 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_op_add(
 								    ggml_backend_cuda_context & ctx,
-												ggml : sync (im2col, GPU conv, 32-bit arm compat) (#4060)

ggml-ci
											
										
										
											2023-11-13 14:55:52 +00:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 20:23:59 +00:00
+								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-												ggml : sync (im2col, GPU conv, 32-bit arm compat) (#4060)

ggml-ci
											
										
										
											2023-11-13 14:55:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
 								}
-												ggml : sync (im2col, GPU conv, 32-bit arm compat) (#4060)

ggml-ci
											
										
										
											2023-11-13 14:55:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_op_acc(
 								    ggml_backend_cuda_context & ctx,
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-												ggml : sync (im2col, GPU conv, 32-bit arm compat) (#4060)

ggml-ci
											
										
										
											2023-11-13 14:55:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(ctx);
-												ggml : sync (im2col, GPU conv, 32-bit arm compat) (#4060)

ggml-ci
											
										
										
											2023-11-13 14:55:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT(src1->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
 								    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
-												ggml : sync (im2col, GPU conv, 32-bit arm compat) (#4060)

ggml-ci
											
										
										
											2023-11-13 14:55:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
 								    int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
 								    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
 								    int offset = dst->op_params[3] / 4; // offset in bytes
-												ggml : sync (im2col, GPU conv, 32-bit arm compat) (#4060)

ggml-ci
											
										
										
											2023-11-13 14:55:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    acc_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream);
-												ggml : sync (im2col, GPU conv, 32-bit arm compat) (#4060)

ggml-ci
											
										
										
											2023-11-13 14:55:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(dst);
-												ggml : sync (im2col, GPU conv, 32-bit arm compat) (#4060)

ggml-ci
											
										
										
											2023-11-13 14:55:52 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_op_mul(
 								    ggml_backend_cuda_context & ctx,
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 20:23:59 +00:00
+								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
 								}
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_op_div(
 								    ggml_backend_cuda_context & ctx,
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_op_gelu(
 								    ggml_backend_cuda_context & ctx,
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 20:23:59 +00:00
+								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(ctx);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    gelu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(src1);
 								    GGML_UNUSED(dst);
 								    GGML_UNUSED(src1_dd);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_op_silu(
 								    ggml_backend_cuda_context & ctx,
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 17:55:51 +00:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 20:23:59 +00:00
+								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(ctx);
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 17:55:51 +00:00
+								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    silu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(src1);
 								    GGML_UNUSED(dst);
 								    GGML_UNUSED(src1_dd);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_op_gelu_quick(
 								    ggml_backend_cuda_context & ctx,
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 17:55:51 +00:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 20:23:59 +00:00
+								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(ctx);
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 17:55:51 +00:00
+								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    gelu_quick_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(src1);
 								    GGML_UNUSED(dst);
 								    GGML_UNUSED(src1_dd);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_op_tanh(
 								    ggml_backend_cuda_context & ctx,
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 17:55:51 +00:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 20:23:59 +00:00
+								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(ctx);
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 17:55:51 +00:00
+								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    tanh_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(src1);
 								    GGML_UNUSED(dst);
 								    GGML_UNUSED(src1_dd);
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 17:55:51 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_op_relu(
 								    ggml_backend_cuda_context & ctx,
-												llm : add MPT support (#3417)

* CUDA: added support for ggml_clamp (see also: https://github.com/ggerganov/ggml/issues/545)

* mpt : added an implementation based (mostly) on falcon integration, modified with deltas from ggml/examples/mpt

* mpt : protect against "clip_qkv": null in mpt-7b

* mpt : quick fix to avoid "Strange model" warning when quantizing MPT models

* mpt : addendum to changeset:84e30e8 - leave parameter clamp_kqv out from metadata rather than use 0.0 to indicate "no clamping" (more compliant with the current GGUF spec?)

* mpt : standardized all tensor names to follow GGUF spec

* mpt : addendum to changeset:1be89c40 - use "req" parameter of GGUF_GET_KEY macro instead of duplicate code

* mpt : fixed comment s/gptneox/mpt/

* mpt : remove tabs, trailing whitespace

* mpt : removed ne01 + n_past == ne00 assertion from alibi (cuda/f32) and rope_shift from build_mpt

* mpt : updated convert-mpt-hf-to-gguf.py to reflect changes made to convert-gptneox-hf-to-gguf.py in pr:3252

* comment out n_past instead of marking it unused

* mpt : removed hardcoded +178 from convert script in favor of utilizing hparams["vocab_size"]

* mpt : remove unused tokenizer_json in convert script

* ggml : remove obsolete n_past assert in ggml_alibi

* llama : print clam_kqv and max_alibi_bias hparams

---------

Co-authored-by: Cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-10-10 07:50:23 +00:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 20:23:59 +00:00
+								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(ctx);
-												llm : add MPT support (#3417)

* CUDA: added support for ggml_clamp (see also: https://github.com/ggerganov/ggml/issues/545)

* mpt : added an implementation based (mostly) on falcon integration, modified with deltas from ggml/examples/mpt

* mpt : protect against "clip_qkv": null in mpt-7b

* mpt : quick fix to avoid "Strange model" warning when quantizing MPT models

* mpt : addendum to changeset:84e30e8 - leave parameter clamp_kqv out from metadata rather than use 0.0 to indicate "no clamping" (more compliant with the current GGUF spec?)

* mpt : standardized all tensor names to follow GGUF spec

* mpt : addendum to changeset:1be89c40 - use "req" parameter of GGUF_GET_KEY macro instead of duplicate code

* mpt : fixed comment s/gptneox/mpt/

* mpt : remove tabs, trailing whitespace

* mpt : removed ne01 + n_past == ne00 assertion from alibi (cuda/f32) and rope_shift from build_mpt

* mpt : updated convert-mpt-hf-to-gguf.py to reflect changes made to convert-gptneox-hf-to-gguf.py in pr:3252

* comment out n_past instead of marking it unused

* mpt : removed hardcoded +178 from convert script in favor of utilizing hparams["vocab_size"]

* mpt : remove unused tokenizer_json in convert script

* ggml : remove obsolete n_past assert in ggml_alibi

* llama : print clam_kqv and max_alibi_bias hparams

---------

Co-authored-by: Cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-10-10 07:50:23 +00:00
+								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-												llm : add MPT support (#3417)

* CUDA: added support for ggml_clamp (see also: https://github.com/ggerganov/ggml/issues/545)

* mpt : added an implementation based (mostly) on falcon integration, modified with deltas from ggml/examples/mpt

* mpt : protect against "clip_qkv": null in mpt-7b

* mpt : quick fix to avoid "Strange model" warning when quantizing MPT models

* mpt : addendum to changeset:84e30e8 - leave parameter clamp_kqv out from metadata rather than use 0.0 to indicate "no clamping" (more compliant with the current GGUF spec?)

* mpt : standardized all tensor names to follow GGUF spec

* mpt : addendum to changeset:1be89c40 - use "req" parameter of GGUF_GET_KEY macro instead of duplicate code

* mpt : fixed comment s/gptneox/mpt/

* mpt : remove tabs, trailing whitespace

* mpt : removed ne01 + n_past == ne00 assertion from alibi (cuda/f32) and rope_shift from build_mpt

* mpt : updated convert-mpt-hf-to-gguf.py to reflect changes made to convert-gptneox-hf-to-gguf.py in pr:3252

* comment out n_past instead of marking it unused

* mpt : removed hardcoded +178 from convert script in favor of utilizing hparams["vocab_size"]

* mpt : remove unused tokenizer_json in convert script

* ggml : remove obsolete n_past assert in ggml_alibi

* llama : print clam_kqv and max_alibi_bias hparams

---------

Co-authored-by: Cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-10-10 07:50:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(src1);
 								    GGML_UNUSED(dst);
 								    GGML_UNUSED(src1_dd);
-												llm : add MPT support (#3417)

* CUDA: added support for ggml_clamp (see also: https://github.com/ggerganov/ggml/issues/545)

* mpt : added an implementation based (mostly) on falcon integration, modified with deltas from ggml/examples/mpt

* mpt : protect against "clip_qkv": null in mpt-7b

* mpt : quick fix to avoid "Strange model" warning when quantizing MPT models

* mpt : addendum to changeset:84e30e8 - leave parameter clamp_kqv out from metadata rather than use 0.0 to indicate "no clamping" (more compliant with the current GGUF spec?)

* mpt : standardized all tensor names to follow GGUF spec

* mpt : addendum to changeset:1be89c40 - use "req" parameter of GGUF_GET_KEY macro instead of duplicate code

* mpt : fixed comment s/gptneox/mpt/

* mpt : remove tabs, trailing whitespace

* mpt : removed ne01 + n_past == ne00 assertion from alibi (cuda/f32) and rope_shift from build_mpt

* mpt : updated convert-mpt-hf-to-gguf.py to reflect changes made to convert-gptneox-hf-to-gguf.py in pr:3252

* comment out n_past instead of marking it unused

* mpt : removed hardcoded +178 from convert script in favor of utilizing hparams["vocab_size"]

* mpt : remove unused tokenizer_json in convert script

* ggml : remove obsolete n_past assert in ggml_alibi

* llama : print clam_kqv and max_alibi_bias hparams

---------

Co-authored-by: Cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-10-10 07:50:23 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_op_hardsigmoid(
 								    ggml_backend_cuda_context & ctx,
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 								    GGML_UNUSED(ctx);
 								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 17:55:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    hardsigmoid_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 17:55:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(src1);
 								    GGML_UNUSED(dst);
 								    GGML_UNUSED(src1_dd);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_op_hardswish(
 								    ggml_backend_cuda_context & ctx,
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 								    GGML_UNUSED(ctx);
 								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-												CUDA: enable peer access between devices (#2470)


											
										
										
											2023-09-17 14:37:53 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    hardswish_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-												CUDA: enable peer access between devices (#2470)


											
										
										
											2023-09-17 14:37:53 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(src1);
 								    GGML_UNUSED(dst);
 								    GGML_UNUSED(src1_dd);
-												CUDA: enable peer access between devices (#2470)


											
										
										
											2023-09-17 14:37:53 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_op_leaky_relu(
 								    ggml_backend_cuda_context & ctx,
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 								    GGML_UNUSED(ctx);
 								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    float negative_slope;
 								    memcpy(&negative_slope, dst->op_params, sizeof(float));
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 17:55:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    leaky_relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream);
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 17:55:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(src1);
 								    GGML_UNUSED(dst);
 								    GGML_UNUSED(src1_dd);
 								}
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_op_sqr(
 								    ggml_backend_cuda_context & ctx,
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 								    GGML_UNUSED(ctx);
 								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    sqr_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(src1);
 								    GGML_UNUSED(dst);
 								    GGML_UNUSED(src1_dd);
 								}
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_op_norm(
 								    ggml_backend_cuda_context & ctx,
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 								    GGML_UNUSED(ctx);
 								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-												Windows nvcc workaround (#1753)

Fix gibberish output on Windows when using CUDA
											
										
										
											2023-06-09 11:58:15 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t ne00 = src0->ne[0];
 								    const int64_t nrows = ggml_nrows(src0);
-												Windows nvcc workaround (#1753)

Fix gibberish output on Windows when using CUDA
											
										
										
											2023-06-09 11:58:15 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    float eps;
 								    memcpy(&eps, dst->op_params, sizeof(float));
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 17:55:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 17:55:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(src1);
 								    GGML_UNUSED(dst);
 								    GGML_UNUSED(src1_dd);
 								}
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_op_group_norm(
 								    ggml_backend_cuda_context & ctx,
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 								    GGML_UNUSED(ctx);
 								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int num_groups = dst->op_params[0];
 								    int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
 								    group_norm_f32_cuda(src0_dd, dst_dd, num_groups * src0->ne[3], group_size, ggml_nelements(src0), main_stream);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(src1);
 								    GGML_UNUSED(dst);
 								    GGML_UNUSED(src1_dd);
 								}
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_op_concat(
 								    ggml_backend_cuda_context & ctx,
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 								    GGML_UNUSED(ctx);
 								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT(src1->type == GGML_TYPE_F32);
 								    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int i3 = 0; i3 < dst->ne[3]; i3++) {
 								        concat_f32_cuda(src0_dd + i3 * (src0->nb[3] / 4), src1_dd + i3 * (src1->nb[3] / 4), dst_dd + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], main_stream);
 								    }
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 16:11:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(src1);
 								    GGML_UNUSED(dst);
 								}
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_op_upscale(
 								    ggml_backend_cuda_context & ctx,
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 								    GGML_UNUSED(ctx);
 								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT(dst->type == GGML_TYPE_F32);
 								    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
-												Better CUDA synchronization logic (#2057)


											
										
										
											2023-07-01 19:49:44 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int scale_factor = dst->op_params[0];
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 16:11:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], scale_factor, main_stream);
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 17:55:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(src1);
 								    GGML_UNUSED(dst);
 								    GGML_UNUSED(src1_dd);
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 16:11:07 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_op_pad(
 								    ggml_backend_cuda_context & ctx,
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 								    GGML_UNUSED(ctx);
 								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT(dst->type == GGML_TYPE_F32);
 								    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    pad_f32_cuda(src0_dd, dst_dd,
 								        src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
 								        dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], main_stream);
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(src1);
 								    GGML_UNUSED(dst);
 								    GGML_UNUSED(src1_dd);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_op_arange(
 								    ggml_backend_cuda_context & ctx,
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 								    GGML_UNUSED(ctx);
 								    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    float start;
 								    float stop;
 								    float step;
 								    memcpy(&start, (float *)dst->op_params + 0, sizeof(float));
 								    memcpy(&stop,  (float *)dst->op_params + 1, sizeof(float));
 								    memcpy(&step,  (float *)dst->op_params + 2, sizeof(float));
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int64_t steps = (int64_t)ceil((stop - start) / step);
 								    GGML_ASSERT(ggml_nelements(dst) == steps);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    arange_f32_cuda(dst_dd, dst->ne[0], start, step, main_stream);
-												cuda : add gelu support

											
										
										
											2023-07-12 17:26:18 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(src0);
 								    GGML_UNUSED(src1);
 								    GGML_UNUSED(src0_dd);
 								    GGML_UNUSED(src1_dd);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_op_timestep_embedding(
 								    ggml_backend_cuda_context & ctx,
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 								    GGML_UNUSED(ctx);
 								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int dim = dst->op_params[0];
 								    const int max_period = dst->op_params[1];
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    timestep_embedding_f32_cuda(src0_dd, dst_dd, src0->ne[0], dst->nb[1], dim, max_period, main_stream);
-												Add ReLU and SQR CUDA ops to (partially) fix Persimmon offloading (#4041)

* Add ReLU and SQR CUDA ops to fix Persimmon offloading

* Persimmon loader: More helpful error on CUDA/ROCM when offloading too many layers
											
										
										
											2023-11-13 08:58:15 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(src1);
 								    GGML_UNUSED(dst);
 								    GGML_UNUSED(src1_dd);
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 13:10:15 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_op_rms_norm(
 								    ggml_backend_cuda_context & ctx,
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 								    GGML_UNUSED(ctx);
 								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t ne00 = src0->ne[0];
 								    const int64_t nrows = ggml_nrows(src0);
-												Add ReLU and SQR CUDA ops to (partially) fix Persimmon offloading (#4041)

* Add ReLU and SQR CUDA ops to fix Persimmon offloading

* Persimmon loader: More helpful error on CUDA/ROCM when offloading too many layers
											
										
										
											2023-11-13 08:58:15 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    float eps;
 								    memcpy(&eps, dst->op_params, sizeof(float));
-												ggml : sync (abort callback, mul / add broadcast, fix alibi) (#2183)


											
										
										
											2023-07-11 19:53:34 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(src1);
 								    GGML_UNUSED(dst);
 								    GGML_UNUSED(src1_dd);
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_op_mul_mat_q(
 								    ggml_backend_cuda_context & ctx,
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
 								    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
 								    const int64_t src1_padded_row_size, cudaStream_t stream) {
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t ne00 = src0->ne[0];
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t ne10 = src1->ne[0];
 								    GGML_ASSERT(ne10 % QK8_1 == 0);
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 12:23:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t ne0 = dst->ne[0];
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 12:23:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t row_diff = row_high - row_low;
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 12:23:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int id = ggml_cuda_get_device();
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 12:23:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // the main device has a larger memory buffer to hold the results from all GPUs
 								    // nrows_dst == nrows of the matrix that the kernel writes into
 								    const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 12:23:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    switch (src0->type) {
 								        case GGML_TYPE_Q4_0:
 								            ggml_mul_mat_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
 								            break;
 								        case GGML_TYPE_Q4_1:
 								            ggml_mul_mat_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
 								            break;
 								        case GGML_TYPE_Q5_0:
 								            ggml_mul_mat_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
 								            break;
 								        case GGML_TYPE_Q5_1:
 								            ggml_mul_mat_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
 								            break;
 								        case GGML_TYPE_Q8_0:
 								            ggml_mul_mat_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
 								            break;
 								        case GGML_TYPE_Q2_K:
 								            ggml_mul_mat_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
 								            break;
 								        case GGML_TYPE_Q3_K:
 								            ggml_mul_mat_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
 								            break;
 								        case GGML_TYPE_Q4_K:
 								            ggml_mul_mat_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
 								            break;
 								        case GGML_TYPE_Q5_K:
 								            ggml_mul_mat_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
 								            break;
 								        case GGML_TYPE_Q6_K:
 								            ggml_mul_mat_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
 								            break;
 								        default:
 								            GGML_ASSERT(false);
 								            break;
 								    }
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 12:23:52 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(src1);
 								    GGML_UNUSED(dst);
 								    GGML_UNUSED(src1_ddf_i);
-												cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)

* Broadcasting for ggml_mul

* CUDA kernel for ggml_mul, norms in VRAM

* GPU weights not in RAM, direct loading with cuFile

* fixup! GPU weights not in RAM, direct loading with cuFile

* fixup! GPU weights not in RAM, direct loading with cuFile

* define default model path once, sync path with readme (#1366)

* ~7% faster Q5_1 AVX2 code (#1477)

* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)

* Support models in a single pytorch_model.bin

* Remove spurious line with typo

* benchmark-matmul: Print the average of the test results (#1490)

* Remove unused n_parts parameter (#1509)

* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)

* Fix for w64devkit and mingw

* make kv_f16 the default for api users (#1517)

* minor : fix compile warnings

* readme : adds WizardLM to the list of supported models (#1485)

* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)

* Make reverse prompt option act as a stop token in non-interactive scenarios

* Making requested review changes

* Update gpt_params_parse and fix a merge error

* Revert "Update gpt_params_parse and fix a merge error"

This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.

* Update gpt_params_parse and fix a merge error take 2

* examples : add persistent chat (#1495)

* examples : add persistent chat

* examples : fix whitespace

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* tests : add missing header

* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)

* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0

* llama : bump LLAMA_FILE_VERSION to 3

* cuda : update Q4 and Q8 dequantize kernels

* ggml : fix AVX dot products

* readme : update performance table + hot topics

* ggml : fix scalar implementation of Q4_1 dot

* llama : fix compile warnings in llama_set_state_data()

* llama : fix name shadowing and C4146 (#1526)

* Fix name shadowing and C4146

* Fix if macros not using defined when required

* Update llama-util.h

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Update llama-util.h

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Code style

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Fix for mingw (#1462)

* llama : add llama_init_backend() API (close #1527)

* feature : add blis and other BLAS implementation support (#1502)

* feature: add blis support

* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927

* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake

* Fix typo in INTEGER

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Revert "feature : add blis and other BLAS implementation support (#1502)"

This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.

* GPU weights not in RAM, direct loading with cuFile

* llama : code style fixes + progress print fix

* ggml : ggml_mul better broadcast support

* cmake : workarounds for cufile when CMake version < 3.25

* gg rebase fixup

* Loop in llama.cpp, fixed progress callback

* Attempt clang-tidy fix

* llama : fix vram size computation

* Add forgotten fclose()

---------

Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
											
										
										
											2023-05-20 12:19:28 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_op_mul_mat_vec_q(
 								    ggml_backend_cuda_context & ctx,
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
 								    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
 								    const int64_t src1_padded_row_size, cudaStream_t stream) {
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
 								    const int64_t ne00 = src0->ne[0];
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t row_diff = row_high - row_low;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t ne10 = src1->ne[0];
 								    GGML_ASSERT(ne10 % QK8_1 == 0);
-												llama : grouped-query attention + LLaMAv2 70B support (#2276)

* CUDA: GQA implementation

* llama : support for GQA and LLaMAv2 70B

ggml-ci

* py : fix hparams parsing (if-else blocks)

ggml-ci

* py : oh boy ..

ggml-ci

* help : fix gqa value for 70B

ggml-ci

---------

Co-authored-by: JohannesGaessler <johannesg@5d6.de>
											
										
										
											2023-07-23 12:09:47 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t ne0 = dst->ne[0];
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int id;
 								    CUDA_CHECK(cudaGetDevice(&id));
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // the main device has a larger memory buffer to hold the results from all GPUs
 								    // nrows_dst == nrows of the matrix that the kernel writes into
 								    const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    switch (src0->type) {
 								        case GGML_TYPE_Q4_0:
 								            mul_mat_vec_q_cuda<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
 								                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
 								            break;
 								        case GGML_TYPE_Q4_1:
 								            mul_mat_vec_q_cuda<QK4_1, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
 								                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
 								            break;
 								        case GGML_TYPE_Q5_0:
 								            mul_mat_vec_q_cuda<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
 								                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
 								            break;
 								        case GGML_TYPE_Q5_1:
 								            mul_mat_vec_q_cuda<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
 								                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
 								            break;
 								        case GGML_TYPE_Q8_0:
 								            mul_mat_vec_q_cuda<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
 								                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
 								            break;
 								        case GGML_TYPE_Q2_K:
 								            mul_mat_vec_q_cuda<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
 								                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
 								            break;
 								        case GGML_TYPE_Q3_K:
 								            mul_mat_vec_q_cuda<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
 								                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
 								            break;
 								        case GGML_TYPE_Q4_K:
 								            mul_mat_vec_q_cuda<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
 								                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
 								            break;
 								        case GGML_TYPE_Q5_K:
 								            mul_mat_vec_q_cuda<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
 								                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
 								            break;
 								        case GGML_TYPE_Q6_K:
 								            mul_mat_vec_q_cuda<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
 								                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
 								            break;
 								        case GGML_TYPE_IQ2_XXS:
 								            mul_mat_vec_q_cuda<QK_K, QI2_XXS, block_iq2_xxs, 1, vec_dot_iq2_xxs_q8_1>
 								                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
 								            break;
 								        case GGML_TYPE_IQ2_XS:
 								            mul_mat_vec_q_cuda<QK_K, QI2_XS, block_iq2_xs, 1, vec_dot_iq2_xs_q8_1>
 								                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
 								            break;
 								        case GGML_TYPE_IQ2_S:
 								            mul_mat_vec_q_cuda<QK_K, QI2_S, block_iq2_s, 1, vec_dot_iq2_s_q8_1>
 								                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
 								            break;
 								        case GGML_TYPE_IQ3_XXS:
 								            mul_mat_vec_q_cuda<QK_K, QI3_XXS, block_iq3_xxs, 1, vec_dot_iq3_xxs_q8_1>
 								                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
 								            break;
 								        case GGML_TYPE_IQ1_S:
 								            mul_mat_vec_q_cuda<QK_K, QI1_S, block_iq1_s, 1, vec_dot_iq1_s_q8_1>
 								                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
 								            break;
 								        case GGML_TYPE_IQ4_NL:
 								            mul_mat_vec_q_cuda<QK4_NL, QI4_NL, block_iq4_nl, VDR_Q4_0_Q8_1_MMVQ, vec_dot_iq4_nl_q8_1>
 								                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
 								            break;
 								        case GGML_TYPE_IQ4_XS:
 								            mul_mat_vec_q_cuda<QK_K, QI4_XS, block_iq4_xs, 1, vec_dot_iq4_xs_q8_1>
 								                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
 								            break;
 								        case GGML_TYPE_IQ3_S:
 								            mul_mat_vec_q_cuda<QK_K, QI3_XS, block_iq3_s, 1, vec_dot_iq3_s_q8_1>
 								                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
 								            break;
 								        default:
 								            GGML_ASSERT(false);
 								            break;
 								    }
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(src1);
 								    GGML_UNUSED(dst);
 								    GGML_UNUSED(src1_ddf_i);
 								    GGML_UNUSED(src1_ncols);
 								    GGML_UNUSED(src1_padded_row_size);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_op_dequantize_mul_mat_vec(
 								    ggml_backend_cuda_context & ctx,
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
 								    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
 								    const int64_t src1_padded_row_size, cudaStream_t stream) {
 								    GGML_UNUSED(ctx);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
+								    const int64_t ne00 = src0->ne[0];
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t row_diff = row_high - row_low;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
 								#ifdef GGML_CUDA_F16
-												cuda : fix LLAMA_CUDA_F16 build (#6197)


											
										
										
											2024-03-21 12:59:53 +00:00
+								    ggml_cuda_pool_alloc<half> src1_dfloat_a(ctx.pool());
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    half * src1_dfloat = nullptr; // dfloat == half
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 13:48:37 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    bool src1_convert_f16 =
 								        src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
 								        src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
 								        src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (src1_convert_f16) {
 								        src1_dfloat = src1_dfloat_a.alloc(ne00);
 								        const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
 								        GGML_ASSERT(to_fp16_cuda != nullptr);
 								        to_fp16_cuda(src1_ddf_i, src1_dfloat, ne00, stream);
 								    }
 								#else
 								    const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
 								#endif // GGML_CUDA_F16
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    switch (src0->type) {
 								        case GGML_TYPE_Q4_0:
 								            dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
 								            break;
 								        case GGML_TYPE_Q4_1:
 								            dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
 								            break;
 								        case GGML_TYPE_Q5_0:
 								            dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
 								            break;
 								        case GGML_TYPE_Q5_1:
 								            dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
 								            break;
 								        case GGML_TYPE_Q8_0:
 								            dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
 								            break;
 								        case GGML_TYPE_Q2_K:
 								            dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
 								            break;
 								        case GGML_TYPE_Q3_K:
 								            dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
 								            break;
 								        case GGML_TYPE_Q4_K:
 								            dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
 								            break;
 								        case GGML_TYPE_Q5_K:
 								            dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
 								            break;
 								        case GGML_TYPE_Q6_K:
 								            dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
 								            break;
 								        case GGML_TYPE_F16:
 								            convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
 								            break;
 								        default:
 								            GGML_ASSERT(false);
 								            break;
 								    }
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(src1);
 								    GGML_UNUSED(dst);
 								    GGML_UNUSED(src1_ddq_i);
 								    GGML_UNUSED(src1_ncols);
 								    GGML_UNUSED(src1_padded_row_size);
 								}
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_op_mul_mat_cublas(
 								    ggml_backend_cuda_context & ctx,
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
 								    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
 								    const int64_t src1_padded_row_size, cudaStream_t stream) {
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_ASSERT(src0_dd_i  != nullptr);
 								    GGML_ASSERT(src1_ddf_i != nullptr);
 								    GGML_ASSERT(dst_dd_i   != nullptr);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t ne00 = src0->ne[0];
 								    const int64_t ne10 = src1->ne[0];
-												ggml-cuda : compute ptrs for cublasGemmBatchedEx in a kernel (#3891)

* ggml-cuda : compute ptrs for cublasGemmBatchedEx in a kernel

* fix warnings
											
										
										
											2023-11-01 22:10:09 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t ne0 = dst->ne[0];
-												ggml-cuda : compute ptrs for cublasGemmBatchedEx in a kernel (#3891)

* ggml-cuda : compute ptrs for cublasGemmBatchedEx in a kernel

* fix warnings
											
										
										
											2023-11-01 22:10:09 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t row_diff = row_high - row_low;
-												ggml-cuda : compute ptrs for cublasGemmBatchedEx in a kernel (#3891)

* ggml-cuda : compute ptrs for cublasGemmBatchedEx in a kernel

* fix warnings
											
										
										
											2023-11-01 22:10:09 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int id = ggml_cuda_get_device();
-												ggml-cuda : compute ptrs for cublasGemmBatchedEx in a kernel (#3891)

* ggml-cuda : compute ptrs for cublasGemmBatchedEx in a kernel

* fix warnings
											
										
										
											2023-11-01 22:10:09 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // the main device has a larger memory buffer to hold the results from all GPUs
 								    // ldc == nrows of the matrix that cuBLAS writes into
 								    int ldc = id == ctx.device ? ne0 : row_diff;
-												cuda : improve text-generation and batched decoding performance (#3776)

* cuda : prints wip

* cuda : new cublas gemm branch for multi-batch quantized src0

* cuda : add F32 sgemm branch

* cuda : fine-tune >= VOLTA params + use MMQ only for small batches

* cuda : remove duplicated cuBLAS GEMM code

* cuda : add CUDA_USE_TENSOR_CORES and GGML_CUDA_FORCE_MMQ macros

* build : add compile option to force use of MMQ kernels
											
										
										
											2023-10-27 14:01:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int compute_capability = get_cuda_global_info().devices[id].cc;
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 13:48:37 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
 								        // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
 								        ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool());
 								        if (src0->type != GGML_TYPE_F16) {
 								            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
 								            GGML_ASSERT(to_fp16_cuda != nullptr);
 								            size_t ne = row_diff*ne00;
 								            src0_as_f16.alloc(ne);
 								            to_fp16_cuda(src0_dd_i, src0_as_f16.get(), ne, stream);
 								        }
 								        const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16.get();
 								        ggml_cuda_pool_alloc<half> src1_as_f16(ctx.pool());
 								        if (src1->type != GGML_TYPE_F16) {
 								            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
 								            GGML_ASSERT(to_fp16_cuda != nullptr);
 								            size_t ne = src1_ncols*ne10;
 								            src1_as_f16.alloc(ne);
 								            to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
 								        }
 								        const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16.get();
 								        ggml_cuda_pool_alloc<half> dst_f16(ctx.pool(), row_diff*src1_ncols);
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 13:48:37 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const half alpha_f16 = 1.0f;
 								        const half beta_f16 = 0.0f;
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 13:48:37 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
 								        CUBLAS_CHECK(
 								            cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
 								                    row_diff, src1_ncols, ne10,
 								                    &alpha_f16, src0_ptr,       CUDA_R_16F, ne00,
 								                                src1_ptr,       CUDA_R_16F, ne10,
 								                    &beta_f16,   dst_f16.get(), CUDA_R_16F, ldc,
 								                    CUBLAS_COMPUTE_16F,
 								                    CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 13:48:37 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
 								        to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
 								    } else {
 								        ggml_cuda_pool_alloc<float> src0_ddq_as_f32(ctx.pool(id));
 								        ggml_cuda_pool_alloc<float> src1_ddq_as_f32(ctx.pool(id));
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 13:48:37 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        if (src0->type != GGML_TYPE_F32) {
 								            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
 								            GGML_ASSERT(to_fp32_cuda != nullptr);
 								            src0_ddq_as_f32.alloc(row_diff*ne00);
 								            to_fp32_cuda(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
 								        }
 								        if (src1->type != GGML_TYPE_F32) {
 								            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src1->type);
 								            GGML_ASSERT(to_fp32_cuda != nullptr);
 								            src1_ddq_as_f32.alloc(src1_ncols*ne10);
 								            to_fp32_cuda(src1_ddf_i, src1_ddq_as_f32.get(), src1_ncols*ne10, stream);
 								        }
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 13:48:37 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
 								        const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 13:48:37 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const float alpha = 1.0f;
 								        const float beta = 0.0f;
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 13:48:37 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
 								        CUBLAS_CHECK(
 								            cublasSgemm(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
 								                    row_diff, src1_ncols, ne10,
 								                    &alpha, src0_ddf_i,  ne00,
 								                            src1_ddf1_i, ne10,
 								                    &beta,  dst_dd_i,    ldc));
-												ggml : fix some mul mat cases + add tests for src1 F16 (ggml/669)

* fixed mul-mat error for old GPUs

* style fixes

* add mul mat src1 f16 test cases, fix more cases

ggml-ci

---------

Co-authored-by: bssrdf <bssrdf@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2023-12-29 08:32:31 +00:00
+								    }
-												llama : add phi-2 + fix NeoX rope + ggml_mul_mat_set_prec (#4490)

* phi2 implementation

* fix breaking change

* phi-2 : various fixes

* phi-2 : use layer norm eps

* py : whitespaces

* llama : fix meta KV override bug

* convert : phi don't add BOS token

* convert : revert "added_tokens_decoder" change

* phi-2 : scale Q instead of KQ for better precision

* ggml : fix NeoX rope to rotate just first n_dims

* cuda : less diff in the rope_neox kernel

* ggml : add ggml_mul_mat_set_prec

ggml-ci

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* cuda : ggml_cuda_op_mul_mat_cublas support F32 precision

* cuda : remove oboslete comment

---------

Co-authored-by: Ebey Abraham <ebeyabraham@microsoft.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-18 17:27:47 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(dst);
 								    GGML_UNUSED(src1_ddq_i);
 								    GGML_UNUSED(src1_padded_row_size);
 								}
-												llama : add phi-2 + fix NeoX rope + ggml_mul_mat_set_prec (#4490)

* phi2 implementation

* fix breaking change

* phi-2 : various fixes

* phi-2 : use layer norm eps

* py : whitespaces

* llama : fix meta KV override bug

* convert : phi don't add BOS token

* convert : revert "added_tokens_decoder" change

* phi-2 : scale Q instead of KQ for better precision

* ggml : fix NeoX rope to rotate just first n_dims

* cuda : less diff in the rope_neox kernel

* ggml : add ggml_mul_mat_set_prec

ggml-ci

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* cuda : ggml_cuda_op_mul_mat_cublas support F32 precision

* cuda : remove oboslete comment

---------

Co-authored-by: Ebey Abraham <ebeyabraham@microsoft.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-18 17:27:47 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_op_rope(
 								    ggml_backend_cuda_context & ctx,
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 								    GGML_UNUSED(ctx);
 								    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
 								    GGML_ASSERT(src0->type == dst->type);
-												llama : add phi-2 + fix NeoX rope + ggml_mul_mat_set_prec (#4490)

* phi2 implementation

* fix breaking change

* phi-2 : various fixes

* phi-2 : use layer norm eps

* py : whitespaces

* llama : fix meta KV override bug

* convert : phi don't add BOS token

* convert : revert "added_tokens_decoder" change

* phi-2 : scale Q instead of KQ for better precision

* ggml : fix NeoX rope to rotate just first n_dims

* cuda : less diff in the rope_neox kernel

* ggml : add ggml_mul_mat_set_prec

ggml-ci

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* cuda : ggml_cuda_op_mul_mat_cublas support F32 precision

* cuda : remove oboslete comment

---------

Co-authored-by: Ebey Abraham <ebeyabraham@microsoft.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-18 17:27:47 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t ne00 = src0->ne[0];
 								    const int64_t ne01 = src0->ne[1];
 								    const int64_t ne2 = dst->ne[2];
 								    const int64_t nrows = ggml_nrows(src0);
-												llama : add phi-2 + fix NeoX rope + ggml_mul_mat_set_prec (#4490)

* phi2 implementation

* fix breaking change

* phi-2 : various fixes

* phi-2 : use layer norm eps

* py : whitespaces

* llama : fix meta KV override bug

* convert : phi don't add BOS token

* convert : revert "added_tokens_decoder" change

* phi-2 : scale Q instead of KQ for better precision

* ggml : fix NeoX rope to rotate just first n_dims

* cuda : less diff in the rope_neox kernel

* ggml : add ggml_mul_mat_set_prec

ggml-ci

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* cuda : ggml_cuda_op_mul_mat_cublas support F32 precision

* cuda : remove oboslete comment

---------

Co-authored-by: Ebey Abraham <ebeyabraham@microsoft.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-18 17:27:47 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    //const int n_past      = ((int32_t *) dst->op_params)[0];
 								    const int n_dims      = ((int32_t *) dst->op_params)[1];
 								    const int mode        = ((int32_t *) dst->op_params)[2];
 								    const int n_ctx       = ((int32_t *) dst->op_params)[3];
 								    const int n_orig_ctx  = ((int32_t *) dst->op_params)[4];
-												llama : add phi-2 + fix NeoX rope + ggml_mul_mat_set_prec (#4490)

* phi2 implementation

* fix breaking change

* phi-2 : various fixes

* phi-2 : use layer norm eps

* py : whitespaces

* llama : fix meta KV override bug

* convert : phi don't add BOS token

* convert : revert "added_tokens_decoder" change

* phi-2 : scale Q instead of KQ for better precision

* ggml : fix NeoX rope to rotate just first n_dims

* cuda : less diff in the rope_neox kernel

* ggml : add ggml_mul_mat_set_prec

ggml-ci

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* cuda : ggml_cuda_op_mul_mat_cublas support F32 precision

* cuda : remove oboslete comment

---------

Co-authored-by: Ebey Abraham <ebeyabraham@microsoft.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-18 17:27:47 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // RoPE alteration for extended context
 								    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
 								    memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
 								    memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
 								    memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
 								    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
 								    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
 								    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
-												llama : add phi-2 + fix NeoX rope + ggml_mul_mat_set_prec (#4490)

* phi2 implementation

* fix breaking change

* phi-2 : various fixes

* phi-2 : use layer norm eps

* py : whitespaces

* llama : fix meta KV override bug

* convert : phi don't add BOS token

* convert : revert "added_tokens_decoder" change

* phi-2 : scale Q instead of KQ for better precision

* ggml : fix NeoX rope to rotate just first n_dims

* cuda : less diff in the rope_neox kernel

* ggml : add ggml_mul_mat_set_prec

ggml-ci

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* cuda : ggml_cuda_op_mul_mat_cublas support F32 precision

* cuda : remove oboslete comment

---------

Co-authored-by: Ebey Abraham <ebeyabraham@microsoft.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-18 17:27:47 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int32_t * pos = nullptr;
 								    if ((mode & 1) == 0) {
 								        GGML_ASSERT(src1->type == GGML_TYPE_I32);
 								        GGML_ASSERT(src1->ne[0] == ne2);
 								        pos = (const int32_t *) src1_dd;
-												llama : add phi-2 + fix NeoX rope + ggml_mul_mat_set_prec (#4490)

* phi2 implementation

* fix breaking change

* phi-2 : various fixes

* phi-2 : use layer norm eps

* py : whitespaces

* llama : fix meta KV override bug

* convert : phi don't add BOS token

* convert : revert "added_tokens_decoder" change

* phi-2 : scale Q instead of KQ for better precision

* ggml : fix NeoX rope to rotate just first n_dims

* cuda : less diff in the rope_neox kernel

* ggml : add ggml_mul_mat_set_prec

ggml-ci

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* cuda : ggml_cuda_op_mul_mat_cublas support F32 precision

* cuda : remove oboslete comment

---------

Co-authored-by: Ebey Abraham <ebeyabraham@microsoft.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-18 17:27:47 +00:00
+								    }
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 13:48:37 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const bool is_neox = mode & 2;
 								    const bool is_glm  = mode & 4;
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 13:48:37 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    rope_corr_dims corr_dims;
 								    ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 13:48:37 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // compute
 								    if (is_glm) {
 								        GGML_ASSERT(false);
 								        rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
 								    } else if (is_neox) {
 								        if (src0->type == GGML_TYPE_F32) {
 								            rope_neox_cuda(
 								                (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
 								                attn_factor, corr_dims, main_stream
 								            );
 								        } else if (src0->type == GGML_TYPE_F16) {
 								            rope_neox_cuda(
 								                (const half *)src0_dd, (half *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
 								                attn_factor, corr_dims, main_stream
 								            );
 								        } else {
 								            GGML_ASSERT(false);
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 13:48:37 +00:00
+								        }
 								    } else {
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        if (src0->type == GGML_TYPE_F32) {
 								            rope_cuda(
 								                (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
 								                attn_factor, corr_dims, main_stream
 								            );
 								        } else if (src0->type == GGML_TYPE_F16) {
 								            rope_cuda(
 								                (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
 								                attn_factor, corr_dims, main_stream
 								            );
 								        } else {
 								            GGML_ASSERT(false);
 								        }
 								    }
-												cuda : fix const ptrs warning causing ROCm build issues (#3913)


											
										
										
											2023-11-02 18:32:11 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(src1);
 								    GGML_UNUSED(dst);
 								    GGML_UNUSED(src1_dd);
 								}
-												ggml-cuda : compute ptrs for cublasGemmBatchedEx in a kernel (#3891)

* ggml-cuda : compute ptrs for cublasGemmBatchedEx in a kernel

* fix warnings
											
										
										
											2023-11-01 22:10:09 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_op_alibi(
 								    ggml_backend_cuda_context & ctx,
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 								    GGML_UNUSED(ctx);
 								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-												cuda : revert CUDA pool stuff (#3944)

* Revert "cuda : add ROCM aliases for CUDA pool stuff (#3918)"

This reverts commit 629f917cd6b96ba1274c49a8aab163b1b189229d.

* Revert "cuda : use CUDA memory pool with async memory allocation/deallocation when available (#3903)"

This reverts commit d6069051de7165a4e06662c89257f5d2905bb156.

ggml-ci
											
										
										
											2023-11-05 07:12:13 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t ne00 = src0->ne[0];
 								    const int64_t ne01 = src0->ne[1];
 								    const int64_t ne02 = src0->ne[2];
 								    const int64_t nrows = ggml_nrows(src0);
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 13:48:37 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    //const int n_past = ((int32_t *) dst->op_params)[0];
 								    const int n_head = ((int32_t *) dst->op_params)[1];
 								    float max_bias;
 								    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 13:48:37 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    //GGML_ASSERT(ne01 + n_past == ne00);
 								    GGML_ASSERT(n_head == ne02);
-												ggml-cuda : fix f16 mul mat (#3961)

* ggml-cuda : fix f16 mul mat

ggml-ci

* silence common.cpp warning (bonus)
											
										
										
											2023-11-05 17:45:16 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
 								    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 18:08:39 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    alibi_f32_cuda(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream);
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 17:55:51 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(src1);
 								    GGML_UNUSED(src1_dd);
 								}
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 18:08:39 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_op_pool2d(
 								    ggml_backend_cuda_context & ctx,
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 								    GGML_UNUSED(ctx);
 								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
 								    const int32_t * opts = (const int32_t *)dst->op_params;
 								    enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
 								    const int k0 = opts[1];
 								    const int k1 = opts[2];
 								    const int s0 = opts[3];
 								    const int s1 = opts[4];
 								    const int p0 = opts[5];
 								    const int p1 = opts[6];
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 18:08:39 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t IH = src0->ne[1];
 								    const int64_t IW = src0->ne[0];
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 18:08:39 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t N = dst->ne[3];
 								    const int64_t OC = dst->ne[2];
 								    const int64_t OH = dst->ne[1];
 								    const int64_t OW = dst->ne[0];
-												CUDA: fix tensor core logic for Pascal and HIP (#4682)


											
										
										
											2023-12-29 22:12:53 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int parallel_elements = N * OC * OH * OW;
 								    const int num_blocks = (parallel_elements + CUDA_POOL2D_BLOCK_SIZE - 1) / CUDA_POOL2D_BLOCK_SIZE;
 								    dim3 block_nums(num_blocks);
 								    pool2d_nchw_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, main_stream>>>(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, parallel_elements, src0_dd, dst_dd, op);
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 18:08:39 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(src1);
 								    GGML_UNUSED(src1_dd);
 								}
-												CUDA: fix tensor core logic for Pascal and HIP (#4682)


											
										
										
											2023-12-29 22:12:53 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_op_im2col(
 								    ggml_backend_cuda_context & ctx,
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 								    GGML_UNUSED(ctx);
 								    GGML_ASSERT(src0->type == GGML_TYPE_F16);
 								    GGML_ASSERT(src1->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
-												CUDA: fix tensor core logic for Pascal and HIP (#4682)


											
										
										
											2023-12-29 22:12:53 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
 								    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
 								    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
 								    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
 								    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
 								    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 18:08:39 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 18:08:39 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t IC = src1->ne[is_2D ? 2 : 1];
 								    const int64_t IH = is_2D ? src1->ne[1] : 1;
 								    const int64_t IW =         src1->ne[0];
-												CUDA: fix tensor core logic for Pascal and HIP (#4682)


											
										
										
											2023-12-29 22:12:53 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t KH = is_2D ? src0->ne[1] : 1;
 								    const int64_t KW =         src0->ne[0];
-												cuda : improve text-generation and batched decoding performance (#3776)

* cuda : prints wip

* cuda : new cublas gemm branch for multi-batch quantized src0

* cuda : add F32 sgemm branch

* cuda : fine-tune >= VOLTA params + use MMQ only for small batches

* cuda : remove duplicated cuBLAS GEMM code

* cuda : add CUDA_USE_TENSOR_CORES and GGML_CUDA_FORCE_MMQ macros

* build : add compile option to force use of MMQ kernels
											
										
										
											2023-10-27 14:01:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t OH = is_2D ? dst->ne[2] : 1;
 								    const int64_t OW =         dst->ne[1];
-												SOTA 2-bit quants (#4773)

* iq2_xxs: basics

* iq2_xxs: scalar and AVX2 dot products

Needed to change Q8_K to have quants in the -127...127 range,
else the IQ2_XXS AVX implementation becomes very awkward.
The alternative would have been to use Q8_0 instead. Perhaps
I'll change later, for now this is what we have.

* iq2_xxs: ARM_NEON dot product

Somehow strangely slow (112 ms/token).

* iq2_xxs: WIP Metal

Dequantize works, something is still wrong with the
dot product.

* iq2_xxs: Metal dot product now works

We have
PP-512 = 475 t/s
TG-128 = 47.3 t/s

Not the greatest performance, but not complete garbage either.

* iq2_xxs: slighty faster dot product

TG-128 is now 48.4 t/s

* iq2_xxs: slighty faster dot product

TG-128 is now 50.9 t/s

* iq2_xxs: even faster Metal dot product

TG-128 is now 54.1 t/s.

Strangely enough, putting the signs lookup table
into shared memory has a bigger impact than the
grid values being in shared memory.

* iq2_xxs: dequantize CUDA kernel - fix conflict with master

* iq2_xxs: quantized CUDA dot product (MMVQ)

We get TG-128 = 153.1 t/s

* iq2_xxs: slightly faster CUDA dot product

TG-128 is now at 155.1 t/s.

* iq2_xxs: add to llama ftype enum

* iq2_xxs: fix MoE on Metal

* Fix missing MMQ ops when on hipBLAS

I had put the ggml_supports_mmq call at the wrong place.

* Fix bug in qequantize_row_iq2_xxs

The 0.25f factor was missing.
Great detective work by @ggerganov!

* Fixing tests

* PR suggestion

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-08 15:02:32 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
 								    const int64_t batch = src1->ne[3];
 								    const size_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 13:48:37 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if(dst->type == GGML_TYPE_F16) {
 								        im2col_cuda(src1_dd, (half*) dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
+								    } else {
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        im2col_cuda(src1_dd, (float*) dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
+								    }
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 16:11:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(src0);
 								    GGML_UNUSED(src0_dd);
 								}
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_op_sum_rows(
 								    ggml_backend_cuda_context & ctx,
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 								    GGML_UNUSED(ctx);
 								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t ncols = src0->ne[0];
 								    const int64_t nrows = ggml_nrows(src0);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    sum_rows_f32_cuda(src0_dd, dst_dd, ncols, nrows, main_stream);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(src1);
 								    GGML_UNUSED(dst);
 								    GGML_UNUSED(src1_dd);
 								}
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_op_argsort(
 								    ggml_backend_cuda_context & ctx,
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 								    GGML_UNUSED(ctx);
 								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_I32);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t ncols = src0->ne[0];
 								    const int64_t nrows = ggml_nrows(src0);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    argsort_f32_i32_cuda(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(src1);
 								    GGML_UNUSED(dst);
 								    GGML_UNUSED(src1_dd);
 								}
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_op_diag_mask_inf(
 								    ggml_backend_cuda_context & ctx,
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 								    GGML_UNUSED(ctx);
 								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t ne00 = src0->ne[0];
 								    const int64_t ne01 = src0->ne[1];
 								    const int nrows0 = ggml_nrows(src0);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int n_past = ((int32_t *) dst->op_params)[0];
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    diag_mask_inf_f32_cuda(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(src1);
 								    GGML_UNUSED(dst);
 								    GGML_UNUSED(src1_dd);
 								}
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_op_soft_max(
 								    ggml_backend_cuda_context & ctx,
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 								    GGML_UNUSED(ctx);
 								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t ne00    = src0->ne[0];
 								    const int64_t nrows_x = ggml_nrows(src0);
 								    const int64_t nrows_y = src0->ne[1];
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    float scale    = 1.0f;
 								    float max_bias = 0.0f;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    memcpy(&scale,    (float *) dst->op_params + 0, sizeof(float));
 								    memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // positions tensor
 								    float * src2_dd = nullptr;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ggml_tensor * src2 = dst->src[2];
 								    const bool use_src2 = src2 != nullptr;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (use_src2) {
 								        src2_dd = (float *)src2->data;
 								    }
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream);
 								}
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_op_scale(
 								    ggml_backend_cuda_context & ctx,
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 								    GGML_UNUSED(ctx);
 								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    float scale;
 								    memcpy(&scale, dst->op_params, sizeof(float));
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
 								    CUDA_CHECK(cudaGetLastError());
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(src1);
 								    GGML_UNUSED(dst);
 								    GGML_UNUSED(src1_dd);
 								}
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_op_clamp(
 								    ggml_backend_cuda_context & ctx,
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 								    GGML_UNUSED(ctx);
 								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    float min;
 								    float max;
 								    memcpy(&min, dst->op_params, sizeof(float));
 								    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
 								    CUDA_CHECK(cudaGetLastError());
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(src1);
 								    GGML_UNUSED(dst);
 								    GGML_UNUSED(src1_dd);
 								}
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								// TODO: remove this function
 								static void ggml_cuda_op_flatten(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
 								    GGML_ASSERT(!src0 || ggml_backend_buffer_is_cuda(src0->buffer));
 								    GGML_ASSERT(!src1 || ggml_backend_buffer_is_cuda(src1->buffer));
 								    GGML_ASSERT(         ggml_backend_buffer_is_cuda(dst->buffer));
 								    // dd = data device
 								    float * src0_ddf = src0 ? (float *) src0->data : nullptr;
 								    float * src1_ddf = src1 ? (float *) src1->data : nullptr;
 								    float *  dst_ddf = (float *) dst->data;
 								    ggml_cuda_set_device(ctx.device);
 								    // do the computation
 								    op(ctx, src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, ctx.stream());
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								    CUDA_CHECK(cudaGetLastError());
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								}
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
 								    static bool peer_access_enabled = false;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
 								    if (peer_access_enabled == enable_peer_access) {
 								        return;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
 								#ifdef NDEBUG
 								    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
 								        ggml_cuda_set_device(id);
 								        CUDA_CHECK(cudaDeviceSynchronize());
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
 								    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
 								        ggml_cuda_set_device(id);
 								        for (int id_other = 0; id_other < ggml_backend_cuda_get_device_count(); ++id_other) {
 								            if (id == id_other) {
 								                continue;
 								            }
 								            if (id != main_device && id_other != main_device) {
 								                continue;
 								            }
 								            int can_access_peer;
 								            CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
 								            if (can_access_peer) {
 								                if (enable_peer_access) {
 								                    cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0);
 								                    if (err != cudaErrorPeerAccessAlreadyEnabled) {
 								                        CUDA_CHECK(err);
 								                    }
 								                } else {
 								                    cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
 								                    if (err != cudaErrorPeerAccessNotEnabled) {
 								                        CUDA_CHECK(err);
 								                    }
 								                }
 								            }
 								        }
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#endif // NDEBUG
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    peer_access_enabled = enable_peer_access;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(main_device);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_op_mul_mat(
 								    ggml_backend_cuda_context & ctx,
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
 								    const bool convert_src1_to_q8_1) {
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t ne00 = src0->ne[0];
 								    const int64_t ne01 = src0->ne[1];
 								    const int64_t ne02 = src0->ne[2];
 								    const int64_t ne03 = src0->ne[3];
-												CUDA: Faster Mixtral prompt processing (#4538)

* CUDA: make MoE tensors contiguous for batch size>1

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-20 14:41:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t ne10 = src1->ne[0];
 								    const int64_t ne11 = src1->ne[1];
 								    const int64_t ne12 = src1->ne[2];
 								    const int64_t ne13 = src1->ne[3];
 								    const int64_t nrows1 = ggml_nrows(src1);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_ASSERT(ne03 == ne13);
 								    const int64_t ne0 = dst->ne[0];
 								    const int64_t ne1 = dst->ne[1];
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int nb2 = dst->nb[2];
 								    const int nb3 = dst->nb[3];
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_ASSERT(ggml_backend_buffer_is_cuda(dst->buffer));
 								    GGML_ASSERT(ggml_backend_buffer_is_cuda(src1->buffer));
 								    ggml_backend_cuda_buffer_context * src1_ctx = (ggml_backend_cuda_buffer_context *) src1->buffer->context;
 								    ggml_backend_cuda_buffer_context * dst_ctx  = (ggml_backend_cuda_buffer_context *) dst->buffer->context;
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
-												CUDA: mul_mat_id always on GPU for batches >= 32 (#4553)


											
										
										
											2023-12-21 17:42:59 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t i02_divisor = ne12 / ne02;
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const size_t src0_ts = ggml_type_size(src0->type);
 								    const size_t src0_bs = ggml_blck_size(src0->type);
 								    const size_t q8_1_ts = sizeof(block_q8_1);
 								    const size_t q8_1_bs = QK8_1;
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const bool src0_is_contiguous = ggml_is_contiguous(src0);
 								    const bool src1_is_contiguous = ggml_is_contiguous(src1);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);
 								    GGML_ASSERT(!(split && ne02 > 1));
 								    GGML_ASSERT(!(split && ne03 > 1));
 								    GGML_ASSERT(!(split && ne02 < ne12));
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ggml_tensor_extra_gpu * src0_extra = split ? (ggml_tensor_extra_gpu *) src0->extra : nullptr;
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
 								    if (split) {
 								        ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
 								        tensor_split = buft_ctx->tensor_split;
 								    }
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    struct dev_data {
 								        ggml_cuda_pool_alloc<char>  src0_dd_alloc;
 								        ggml_cuda_pool_alloc<float> src1_ddf_alloc;
 								        ggml_cuda_pool_alloc<char>  src1_ddq_alloc;
 								        ggml_cuda_pool_alloc<float>   dst_dd_alloc;
-												CUDA: Faster Mixtral prompt processing (#4538)

* CUDA: make MoE tensors contiguous for batch size>1

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-20 14:41:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        char  *  src0_dd = nullptr;
 								        float * src1_ddf = nullptr; // float
 								        char  * src1_ddq = nullptr; // q8_1
 								        float *   dst_dd = nullptr;
-												CUDA: Faster Mixtral prompt processing (#4538)

* CUDA: make MoE tensors contiguous for batch size>1

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-20 14:41:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        int64_t  row_low;
 								        int64_t row_high;
 								    };
-												CUDA: Faster Mixtral prompt processing (#4538)

* CUDA: make MoE tensors contiguous for batch size>1

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-20 14:41:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    dev_data dev[GGML_CUDA_MAX_DEVICES];
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int used_devices = 0;
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
 								        // by default, use all rows
 								        dev[id].row_low  = 0;
 								        dev[id].row_high = ne01;
 								        // for multi GPU, get the row boundaries from tensor split
 								        // and round to mul_mat_q tile sizes
 								        if (split) {
 								            const int64_t rounding = get_row_rounding(src0->type, tensor_split);
 								            if (id != 0) {
 								                dev[id].row_low  = ne01*tensor_split[id];
 								                if (dev[id].row_low < ne01) {
 								                    dev[id].row_low -= dev[id].row_low % rounding;
 								                }
-												CUDA: Faster Mixtral prompt processing (#4538)

* CUDA: make MoE tensors contiguous for batch size>1

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-20 14:41:22 +00:00
+								            }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            if (id != ggml_backend_cuda_get_device_count() - 1) {
 								                dev[id].row_high  = ne01*tensor_split[id + 1];
 								                if (dev[id].row_high < ne01) {
 								                    dev[id].row_high -= dev[id].row_high % rounding;
 								                }
-												CUDA: Faster Mixtral prompt processing (#4538)

* CUDA: make MoE tensors contiguous for batch size>1

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-20 14:41:22 +00:00
+								            }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        }
 								    }
-												CUDA: Faster Mixtral prompt processing (#4538)

* CUDA: make MoE tensors contiguous for batch size>1

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-20 14:41:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
 								        if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) {
 								            continue;
 								        }
-												CUDA: Faster Mixtral prompt processing (#4538)

* CUDA: make MoE tensors contiguous for batch size>1

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-20 14:41:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        used_devices++;
-												CUDA: Faster Mixtral prompt processing (#4538)

* CUDA: make MoE tensors contiguous for batch size>1

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-20 14:41:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        const bool src1_on_device = id == src1_ctx->device;
 								        const bool  dst_on_device = id == dst_ctx->device;
-												CUDA: Faster Mixtral prompt processing (#4538)

* CUDA: make MoE tensors contiguous for batch size>1

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-20 14:41:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        ggml_cuda_set_device(id);
 								        cudaStream_t stream = ctx.stream(id, 0);
-												CUDA: Faster Mixtral prompt processing (#4538)

* CUDA: make MoE tensors contiguous for batch size>1

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-20 14:41:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        if (src0_is_contiguous) {
 								            dev[id].src0_dd = split ? (char *) src0_extra->data_device[id] : (char *) src0->data;
 								        } else {
 								            dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), ggml_nbytes(src0));
 								        }
-												CUDA: Faster Mixtral prompt processing (#4538)

* CUDA: make MoE tensors contiguous for batch size>1

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-20 14:41:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        if (src1_on_device && src1_is_contiguous) {
 								            dev[id].src1_ddf = (float *) src1->data;
 								        } else {
 								            dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1));
 								        }
-												CUDA: Faster Mixtral prompt processing (#4538)

* CUDA: make MoE tensors contiguous for batch size>1

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-20 14:41:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        if (convert_src1_to_q8_1) {
 								            dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
-												CUDA: Faster Mixtral prompt processing (#4538)

* CUDA: make MoE tensors contiguous for batch size>1

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-20 14:41:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            if (src1_on_device && src1_is_contiguous) {
 								                quantize_row_q8_1_cuda(dev[id].src1_ddf, dev[id].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
 								                CUDA_CHECK(cudaGetLastError());
-												CUDA: Faster Mixtral prompt processing (#4538)

* CUDA: make MoE tensors contiguous for batch size>1

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-20 14:41:22 +00:00
+								            }
 								        }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
 								        if (dst_on_device) {
 								            dev[id].dst_dd = (float *) dst->data;
 								        } else {
 								            const size_t size_dst_ddf = split ? (dev[id].row_high - dev[id].row_low)*ne1 : ggml_nelements(dst);
 								            dev[id].dst_dd = dev[id].dst_dd_alloc.alloc(ctx.pool(id), size_dst_ddf);
 								        }
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
+								    }
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // if multiple devices are used they need to wait for the main device
 								    // here an event is recorded that signals that the main device has finished calculating the input data
 								    if (split && used_devices > 1) {
 								        ggml_cuda_set_device(ctx.device);
 								        CUDA_CHECK(cudaEventRecord(src0_extra->events[ctx.device][0], ctx.stream()));
 								    }
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
 								    for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
 								        const int64_t is = split ? (src1_col_0/src1_col_stride) % GGML_CUDA_MAX_STREAMS : 0;
 								        const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
-												llm : add MPT support (#3417)

* CUDA: added support for ggml_clamp (see also: https://github.com/ggerganov/ggml/issues/545)

* mpt : added an implementation based (mostly) on falcon integration, modified with deltas from ggml/examples/mpt

* mpt : protect against "clip_qkv": null in mpt-7b

* mpt : quick fix to avoid "Strange model" warning when quantizing MPT models

* mpt : addendum to changeset:84e30e8 - leave parameter clamp_kqv out from metadata rather than use 0.0 to indicate "no clamping" (more compliant with the current GGUF spec?)

* mpt : standardized all tensor names to follow GGUF spec

* mpt : addendum to changeset:1be89c40 - use "req" parameter of GGUF_GET_KEY macro instead of duplicate code

* mpt : fixed comment s/gptneox/mpt/

* mpt : remove tabs, trailing whitespace

* mpt : removed ne01 + n_past == ne00 assertion from alibi (cuda/f32) and rope_shift from build_mpt

* mpt : updated convert-mpt-hf-to-gguf.py to reflect changes made to convert-gptneox-hf-to-gguf.py in pr:3252

* comment out n_past instead of marking it unused

* mpt : removed hardcoded +178 from convert script in favor of utilizing hparams["vocab_size"]

* mpt : remove unused tokenizer_json in convert script

* ggml : remove obsolete n_past assert in ggml_alibi

* llama : print clam_kqv and max_alibi_bias hparams

---------

Co-authored-by: Cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-10-10 07:50:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
 								            if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) {
 								                continue;
 								            }
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            const bool src1_on_device = id == src1_ctx->device;
 								            const bool  dst_on_device = id == dst_ctx->device;
 								            const int64_t row_diff = dev[id].row_high - dev[id].row_low;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            ggml_cuda_set_device(id);
 								            cudaStream_t stream = ctx.stream(id, is);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            // wait for main GPU data if necessary
 								            if (split && (id != ctx.device || is != 0)) {
 								                CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[ctx.device][0], 0));
 								            }
-												sync : ggml (#0)

											
										
										
											2024-01-30 14:21:57 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
 								                const int64_t i03 = i0 / ne12;
 								                const int64_t i02 = i0 % ne12;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								                const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
 								                // for split tensors the data begins at i0 == i0_offset_low
 								                char  *  src0_dd_i =  dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
 								                float * src1_ddf_i = dev[id].src1_ddf + (i0*ne11 + src1_col_0) * ne10;
 								                char  * src1_ddq_i = dev[id].src1_ddq +  src1_ddq_i_offset;
 								                float *   dst_dd_i =   dev[id].dst_dd + (i0*ne1  + src1_col_0) * (dst_on_device ? ne0 : row_diff);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								                // the main device memory buffer can be on VRAM scratch, with space for all partial results
 								                // in that case an offset on dst_ddf_i is needed
 								                if (id == ctx.device) {
 								                    dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split
 								                }
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 12:37:33 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								                // copy src0, src1 to device if necessary
 								                if (src1_is_contiguous) {
 								                    if (id != ctx.device) {
 								                        if (convert_src1_to_q8_1) {
 								                            char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset;
 								                            CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddq_i, id, src1_ddq_i_source, ctx.device,
 								                                                            src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
 								                        } else {
 								                            float * src1_ddf_i_source = (float *) src1->data;
 								                            src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
 								                            CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddf_i, id, src1_ddf_i_source, ctx.device,
 								                                                            src1_ncols*ne10*sizeof(float), stream));
 								                        }
 								                    }
 								                } else if (src1_on_device && !src1_is_contiguous) {
 								                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
 								                                src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
 								                } else {
 								                    GGML_ASSERT(false);
 								                }
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								                if (convert_src1_to_q8_1 && !src1_is_contiguous) {
 								                    quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
 								                    CUDA_CHECK(cudaGetLastError());
 								                }
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								                if (src1_col_0 == 0 && !src0_is_contiguous && i02 % i02_divisor == 0) {
 								                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, dev[id].row_low, dev[id].row_high, stream));
 								                }
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								                // do the computation
 								                op(ctx, src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
 								                    dev[id].row_low, dev[id].row_high, src1_ncols, src1_padded_col_size, stream);
 								                CUDA_CHECK(cudaGetLastError());
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								                // copy dst to host or other device if necessary
 								                if (!dst_on_device) {
 								                    void * dst_off_device = dst->data;
 								                    if (split) {
 								                        // src0 = weight matrix is saved as a transposed matrix for better memory layout.
 								                        // dst is NOT transposed.
 								                        // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
 								                        // Instead they need to be copied to the correct slice in ne0 = dst row index.
 								                        // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
 								                        float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
 								                        GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
 								                        dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;
 								#if !defined(GGML_USE_HIPBLAS)
 								                        // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
 								                        cudaMemcpy3DPeerParms p = {};
 								                        p.dstDevice = ctx.device;
 								                        p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), row_diff, src1_ncols);
 								                        p.srcDevice = id;
 								                        p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols);
 								                        p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1);
 								                        CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream));
 								#else
 								                        // HIP does not support cudaMemcpy3DPeerAsync or vmm pools
 								                        CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float),
 								                                                        dst_dd_i, row_diff*sizeof(float),
 								                                                        row_diff*sizeof(float), src1_ncols,
 								                                                        cudaMemcpyDeviceToDevice, stream));
 								#endif
 								                    } else {
 								                        float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
 								                        GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
 								                        dhf_dst_i += src1_col_0*ne0;
 								                        CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), cudaMemcpyDeviceToDevice, stream));
 								                    }
 								                }
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								                // add event for the main device to wait on until other device is done
 								                if (split && (id != ctx.device || is != 0)) {
 								                    CUDA_CHECK(cudaEventRecord(src0_extra->events[id][is], stream));
 								                }
 								            }
 								        }
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // main device waits for all other devices to be finished
 								    if (split && ggml_backend_cuda_get_device_count() > 1) {
 								        int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
 								        is_max = is_max <= GGML_CUDA_MAX_STREAMS ? is_max : GGML_CUDA_MAX_STREAMS;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        ggml_cuda_set_device(ctx.device);
 								        for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
 								            if (dev[id].row_low == dev[id].row_high) {
 								                continue;
 								            }
 								            for (int64_t is = 0; is < is_max; ++is) {
 								                CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), src0_extra->events[id][is], 0));
 								            }
 								        }
 								    }
-												Support dup & cont ops on CUDA (#2242)


											
										
										
											2023-07-17 17:39:29 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_repeat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_repeat);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_get_rows(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_get_rows);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_add(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_add);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 19:33:23 +00:00
+								}
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 16:11:07 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_acc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_acc);
-												ggml : sync latest (SAM + SD operators, CUDA alibi) (#2709)

* ggml : sync latest (SAM + SD operators, CUDA alibi)

ggml-ci

* ggml : fix tabs
											
										
										
											2023-08-22 11:22:08 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_mul(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_mul);
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 13:10:15 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_div(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_div);
-												ggml : sync (im2col, GPU conv, 32-bit arm compat) (#4060)

ggml-ci
											
										
										
											2023-11-13 14:55:52 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_gelu(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_gelu);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_silu(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_silu);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_gelu_quick(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_gelu_quick);
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 16:11:07 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_tanh(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_tanh);
-												ggml : use ggml_row_size where possible (#4472)

* ggml : use ggml_row_size where possible

ggml-ci

* ggml : move ggml_nbytes_split to ggml-cuda.cu
											
										
										
											2023-12-14 19:05:21 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_relu(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_relu);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 17:47:19 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_hardsigmoid(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_hardsigmoid);
 								}
-												backend : offload large batches to GPU (#6083)

* backend : offload large batches to GPU

* fix hip

* code cleanup

* fix CUDA split buffers

* Update ggml-backend-impl.h

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix memset without set_device

* imatrix : remove sched affix from weight names

* sched : add a new split if the current one has too many inputs
reduce max inputs per split
more cleanup

* update backends

ggml-ci

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-03-18 10:03:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_hardswish(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_hardswish);
 								}
 								static void ggml_cuda_leaky_relu(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_leaky_relu);
 								}
-												cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)

* Broadcasting for ggml_mul

* CUDA kernel for ggml_mul, norms in VRAM

* GPU weights not in RAM, direct loading with cuFile

* fixup! GPU weights not in RAM, direct loading with cuFile

* fixup! GPU weights not in RAM, direct loading with cuFile

* define default model path once, sync path with readme (#1366)

* ~7% faster Q5_1 AVX2 code (#1477)

* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)

* Support models in a single pytorch_model.bin

* Remove spurious line with typo

* benchmark-matmul: Print the average of the test results (#1490)

* Remove unused n_parts parameter (#1509)

* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)

* Fix for w64devkit and mingw

* make kv_f16 the default for api users (#1517)

* minor : fix compile warnings

* readme : adds WizardLM to the list of supported models (#1485)

* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)

* Make reverse prompt option act as a stop token in non-interactive scenarios

* Making requested review changes

* Update gpt_params_parse and fix a merge error

* Revert "Update gpt_params_parse and fix a merge error"

This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.

* Update gpt_params_parse and fix a merge error take 2

* examples : add persistent chat (#1495)

* examples : add persistent chat

* examples : fix whitespace

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* tests : add missing header

* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)

* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0

* llama : bump LLAMA_FILE_VERSION to 3

* cuda : update Q4 and Q8 dequantize kernels

* ggml : fix AVX dot products

* readme : update performance table + hot topics

* ggml : fix scalar implementation of Q4_1 dot

* llama : fix compile warnings in llama_set_state_data()

* llama : fix name shadowing and C4146 (#1526)

* Fix name shadowing and C4146

* Fix if macros not using defined when required

* Update llama-util.h

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Update llama-util.h

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Code style

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Fix for mingw (#1462)

* llama : add llama_init_backend() API (close #1527)

* feature : add blis and other BLAS implementation support (#1502)

* feature: add blis support

* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927

* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake

* Fix typo in INTEGER

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Revert "feature : add blis and other BLAS implementation support (#1502)"

This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.

* GPU weights not in RAM, direct loading with cuFile

* llama : code style fixes + progress print fix

* ggml : ggml_mul better broadcast support

* cmake : workarounds for cufile when CMake version < 3.25

* gg rebase fixup

* Loop in llama.cpp, fixed progress callback

* Attempt clang-tidy fix

* llama : fix vram size computation

* Add forgotten fclose()

---------

Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
											
										
										
											2023-05-20 12:19:28 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_sqr(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_sqr);
 								}
-												CUDA: Faster Mixtral prompt processing (#4538)

* CUDA: make MoE tensors contiguous for batch size>1

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-20 14:41:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_norm(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_norm);
-												cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)

* Broadcasting for ggml_mul

* CUDA kernel for ggml_mul, norms in VRAM

* GPU weights not in RAM, direct loading with cuFile

* fixup! GPU weights not in RAM, direct loading with cuFile

* fixup! GPU weights not in RAM, direct loading with cuFile

* define default model path once, sync path with readme (#1366)

* ~7% faster Q5_1 AVX2 code (#1477)

* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)

* Support models in a single pytorch_model.bin

* Remove spurious line with typo

* benchmark-matmul: Print the average of the test results (#1490)

* Remove unused n_parts parameter (#1509)

* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)

* Fix for w64devkit and mingw

* make kv_f16 the default for api users (#1517)

* minor : fix compile warnings

* readme : adds WizardLM to the list of supported models (#1485)

* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)

* Make reverse prompt option act as a stop token in non-interactive scenarios

* Making requested review changes

* Update gpt_params_parse and fix a merge error

* Revert "Update gpt_params_parse and fix a merge error"

This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.

* Update gpt_params_parse and fix a merge error take 2

* examples : add persistent chat (#1495)

* examples : add persistent chat

* examples : fix whitespace

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* tests : add missing header

* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)

* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0

* llama : bump LLAMA_FILE_VERSION to 3

* cuda : update Q4 and Q8 dequantize kernels

* ggml : fix AVX dot products

* readme : update performance table + hot topics

* ggml : fix scalar implementation of Q4_1 dot

* llama : fix compile warnings in llama_set_state_data()

* llama : fix name shadowing and C4146 (#1526)

* Fix name shadowing and C4146

* Fix if macros not using defined when required

* Update llama-util.h

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Update llama-util.h

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Code style

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Fix for mingw (#1462)

* llama : add llama_init_backend() API (close #1527)

* feature : add blis and other BLAS implementation support (#1502)

* feature: add blis support

* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927

* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake

* Fix typo in INTEGER

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Revert "feature : add blis and other BLAS implementation support (#1502)"

This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.

* GPU weights not in RAM, direct loading with cuFile

* llama : code style fixes + progress print fix

* ggml : ggml_mul better broadcast support

* cmake : workarounds for cufile when CMake version < 3.25

* gg rebase fixup

* Loop in llama.cpp, fixed progress callback

* Attempt clang-tidy fix

* llama : fix vram size computation

* Add forgotten fclose()

---------

Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
											
										
										
											2023-05-20 12:19:28 +00:00
+								}
-												llama : add benchmark example (#2626)

* llama : add benchmark example

* add to examples CMakeLists.txt

* fix msvc build

* add missing include

* add Bessel's correction to stdev calculation

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* improve markdown formatting

* add missing include

* print warning is NDEBUG is not defined

* remove n_prompt and n_gen from the matrix, use each value separately instead

* better checks for non-optimized builds

* llama.cpp : fix MEM_REQ_SCRATCH0 reusing the value of n_ctx of the first call

* fix json formatting

* add sql output

* add basic cpu and gpu info (linx/cuda only)

* markdown: also show values that differ from the default

* markdown: add build id

* cleanup

* improve formatting

* formatting

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-08-18 10:44:58 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_group_norm(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_group_norm);
-												llama : add benchmark example (#2626)

* llama : add benchmark example

* add to examples CMakeLists.txt

* fix msvc build

* add missing include

* add Bessel's correction to stdev calculation

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* improve markdown formatting

* add missing include

* print warning is NDEBUG is not defined

* remove n_prompt and n_gen from the matrix, use each value separately instead

* better checks for non-optimized builds

* llama.cpp : fix MEM_REQ_SCRATCH0 reusing the value of n_ctx of the first call

* fix json formatting

* add sql output

* add basic cpu and gpu info (linx/cuda only)

* markdown: also show values that differ from the default

* markdown: add build id

* cleanup

* improve formatting

* formatting

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-08-18 10:44:58 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_concat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_concat);
-												llama : add benchmark example (#2626)

* llama : add benchmark example

* add to examples CMakeLists.txt

* fix msvc build

* add missing include

* add Bessel's correction to stdev calculation

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* improve markdown formatting

* add missing include

* print warning is NDEBUG is not defined

* remove n_prompt and n_gen from the matrix, use each value separately instead

* better checks for non-optimized builds

* llama.cpp : fix MEM_REQ_SCRATCH0 reusing the value of n_ctx of the first call

* fix json formatting

* add sql output

* add basic cpu and gpu info (linx/cuda only)

* markdown: also show values that differ from the default

* markdown: add build id

* cleanup

* improve formatting

* formatting

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-08-18 10:44:58 +00:00
+								}
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_upscale(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_upscale);
 								}
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_pad(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_pad);
 								}
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_arange(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_arange);
 								}
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_timestep_embedding(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_timestep_embedding);
 								}
-												llama : add pipeline parallelism support (#6017)

* llama : add pipeline parallelism support for batch processing with multiple CUDA GPUs

ggml-ci

* server : add -ub, --ubatch-size parameter

* fix server embedding test

* llama : fix Mamba inference for pipeline parallelism

Tested to work correctly with both `main` and `parallel` examples.

* llama : limit max batch size to n_batch

* add LLAMA_SCHED_MAX_COPIES to configure the number of input copies for pipeline parallelism
default increase to 4 (from 2)

changing this value may improve performance for some systems, but increases memory usage

* fix hip build

* fix sycl build (disable cpy_tensor_async)

* fix hip build

* llama : limit n_batch and n_ubatch to n_ctx during context creation

* llama : fix norm backend

* batched-bench : sync after decode

* swiftui : sync after decode

* ggml : allow ggml_get_rows to use multiple threads if they are available

* check n_ubatch >= n_tokens with non-casual attention

* llama : do not limit n_batch to n_ctx with non-casual attn

* server : construct batch with size of llama_n_batch

* ggml_backend_cpu_graph_compute : fix return value when alloc fails

* llama : better n_batch and n_ubatch comment

* fix merge

* small fix

* reduce default n_batch to 2048

---------

Co-authored-by: Francis Couture-Harpin <git@compilade.net>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-03-13 17:54:21 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_rms_norm(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_rms_norm);
 								}
-												llama : add pipeline parallelism support (#6017)

* llama : add pipeline parallelism support for batch processing with multiple CUDA GPUs

ggml-ci

* server : add -ub, --ubatch-size parameter

* fix server embedding test

* llama : fix Mamba inference for pipeline parallelism

Tested to work correctly with both `main` and `parallel` examples.

* llama : limit max batch size to n_batch

* add LLAMA_SCHED_MAX_COPIES to configure the number of input copies for pipeline parallelism
default increase to 4 (from 2)

changing this value may improve performance for some systems, but increases memory usage

* fix hip build

* fix sycl build (disable cpy_tensor_async)

* fix hip build

* llama : limit n_batch and n_ubatch to n_ctx during context creation

* llama : fix norm backend

* batched-bench : sync after decode

* swiftui : sync after decode

* ggml : allow ggml_get_rows to use multiple threads if they are available

* check n_ubatch >= n_tokens with non-casual attention

* llama : do not limit n_batch to n_ctx with non-casual attn

* server : construct batch with size of llama_n_batch

* ggml_backend_cpu_graph_compute : fix return value when alloc fails

* llama : better n_batch and n_ubatch comment

* fix merge

* small fix

* reduce default n_batch to 2048

---------

Co-authored-by: Francis Couture-Harpin <git@compilade.net>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-03-13 17:54:21 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
 								    GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
 								    GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
 								    GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
 								    GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
 								    GGML_ASSERT(src0->type == GGML_TYPE_F16);
 								    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t ne00 = src0->ne[0];
 								    const int64_t ne01 = src0->ne[1];
 								    const int64_t ne02 = src0->ne[2];
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t ne12 = src1->ne[2];
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ggml_cuda_set_device(ctx.device);
 								    cudaStream_t main_stream = ctx.stream();
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    void  * src0_ddq = src0->data;
 								    float * src1_ddf = (float *) src1->data;
 								    float * dst_ddf  = (float *) dst->data;
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
 								}
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
 								    GGML_ASSERT(!ggml_is_transposed(src0));
 								    GGML_ASSERT(!ggml_is_transposed(src1));
 								    GGML_ASSERT(!ggml_is_permuted(src0));
 								    GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
 								    GGML_ASSERT(src0->type == GGML_TYPE_F16);
 								    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t ne00 = src0->ne[0];
 								    const int64_t ne01 = src0->ne[1];
 								    const int64_t ne02 = src0->ne[2];
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t nb01 = src0->nb[1];
 								    const int64_t nb02 = src0->nb[2];
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t ne12 = src1->ne[2];
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ggml_cuda_set_device(ctx.device);
 								    cudaStream_t main_stream = ctx.stream();
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    void  * src0_ddq = src0->data;
 								    float * src1_ddf = (float *) src1->data;
 								    float * dst_ddf  = (float *) dst->data;
 								    const int64_t row_stride_x = nb01 / sizeof(half);
 								    const int64_t channel_stride_x = nb02 / sizeof(half);
 								    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static __global__ void k_compute_batched_ptrs(
 								        const half * src0_as_f16, const half * src1_as_f16, char * dst,
 								        const void ** ptrs_src, void ** ptrs_dst,
 								        int64_t ne12, int64_t ne13,
 								        int64_t ne23,
 								        size_t  nb02, size_t  nb03,
 								        size_t  nb12, size_t  nb13,
 								        size_t  nbd2, size_t  nbd3,
 								        int64_t r2,   int64_t r3) {
 								    int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
 								    int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (i13 >= ne13 || i12 >= ne12) {
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
+								        return;
 								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int64_t i03 = i13 / r3;
 								    int64_t i02 = i12 / r2;
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
 								    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
 								    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)         dst + i12*nbd2 + i13*nbd3;
 								}
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    GGML_ASSERT(!ggml_is_transposed(src0));
 								    GGML_ASSERT(!ggml_is_transposed(src1));
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
 								    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_TENSOR_BINARY_OP_LOCALS
 								    const int64_t ne_dst = ggml_nelements(dst);
 								    ggml_cuda_set_device(ctx.device);
 								    cudaStream_t main_stream = ctx.stream();
 								    CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(), main_stream));
 								    void * src0_ddq = src0->data;
 								    half * src0_f16 = (half *) src0_ddq;
 								    float * src1_ddf = (float *) src1->data;
 								    float * dst_ddf  = (float *) dst->data;
 								    // convert src1 to fp16
 								    ggml_cuda_pool_alloc<half> src1_f16_alloc(ctx.pool());
 								    if (src1->type != GGML_TYPE_F16) {
 								        const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
 								        const int64_t ne_src1 = ggml_nelements(src1);
 								        src1_f16_alloc.alloc(ne_src1);
 								        GGML_ASSERT(to_fp16_cuda != nullptr);
 								        to_fp16_cuda(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    half * src1_f16 = src1->type == GGML_TYPE_F16 ? (half *) src1_ddf : src1_f16_alloc.get();
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ggml_cuda_pool_alloc<half> dst_f16(ctx.pool());
 								    char * dst_t;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
 								    cudaDataType_t      cu_data_type    = CUDA_R_16F;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // dst strides
 								    size_t nbd2 = dst->nb[2];
 								    size_t nbd3 = dst->nb[3];
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const half  alpha_f16 = 1.0f;
 								    const half  beta_f16  = 0.0f;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const float alpha_f32 = 1.0f;
 								    const float beta_f32  = 0.0f;
-												llama : initial ggml-backend integration (#4520)

* llama : initial ggml-backend integration

* add ggml-metal

* cuda backend can be used though ggml-backend with LLAMA_GGML_BACKEND_CUDA_TEST
access all tensor data with ggml_backend_tensor_get/set

* add ggml_backend_buffer_clear
zero-init KV cache buffer

* add ggml_backend_buffer_is_hos, used to avoid copies if possible when accesing tensor data

* disable gpu backends with ngl 0

* more accurate mlock

* unmap offloaded part of the model

* use posix_fadvise64(.., POSIX_FADV_SEQUENTIAL) to improve performance with mmap

* update quantize and lora

* update session copy/set to use ggml-backend

ggml-ci

* use posix_fadvise instead of posix_fadvise64

* ggml_backend_alloc_ctx_tensors_from_buft : remove old print

* llama_mmap::align_offset : use pointers instead of references for out parameters

* restore progress_callback behavior

* move final progress_callback call to load_all_data

* cuda : fix fprintf format string (minor)

* do not offload scales

* llama_mmap : avoid unmapping the same fragments again in the destructor

* remove unnecessary unmap

* metal : add default log function that prints to stderr, cleanup code

ggml-ci

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-12-21 20:07:46 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const void * alpha = &alpha_f16;
 								    const void * beta  = &beta_f16;
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (dst->op_params[0] == GGML_PREC_DEFAULT) {
 								        dst_t = (char *) dst_f16.alloc(ne_dst);
 								        nbd2 /= sizeof(float) / sizeof(half);
 								        nbd3 /= sizeof(float) / sizeof(half);
 								    } else {
 								        dst_t = (char *) dst_ddf;
 								        cu_compute_type = CUBLAS_COMPUTE_32F;
 								        cu_data_type    = CUDA_R_32F;
 								        alpha = &alpha_f32;
 								        beta  = &beta_f32;
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								    }
-												llama : add pipeline parallelism support (#6017)

* llama : add pipeline parallelism support for batch processing with multiple CUDA GPUs

ggml-ci

* server : add -ub, --ubatch-size parameter

* fix server embedding test

* llama : fix Mamba inference for pipeline parallelism

Tested to work correctly with both `main` and `parallel` examples.

* llama : limit max batch size to n_batch

* add LLAMA_SCHED_MAX_COPIES to configure the number of input copies for pipeline parallelism
default increase to 4 (from 2)

changing this value may improve performance for some systems, but increases memory usage

* fix hip build

* fix sycl build (disable cpy_tensor_async)

* fix hip build

* llama : limit n_batch and n_ubatch to n_ctx during context creation

* llama : fix norm backend

* batched-bench : sync after decode

* swiftui : sync after decode

* ggml : allow ggml_get_rows to use multiple threads if they are available

* check n_ubatch >= n_tokens with non-casual attention

* llama : do not limit n_batch to n_ctx with non-casual attn

* server : construct batch with size of llama_n_batch

* ggml_backend_cpu_graph_compute : fix return value when alloc fails

* llama : better n_batch and n_ubatch comment

* fix merge

* small fix

* reduce default n_batch to 2048

---------

Co-authored-by: Francis Couture-Harpin <git@compilade.net>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-03-13 17:54:21 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_ASSERT(ne12 % ne02 == 0);
 								    GGML_ASSERT(ne13 % ne03 == 0);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // broadcast factors
 								    const int64_t r2 = ne12/ne02;
 								    const int64_t r3 = ne13/ne03;
-												llama : initial ggml-backend integration (#4520)

* llama : initial ggml-backend integration

* add ggml-metal

* cuda backend can be used though ggml-backend with LLAMA_GGML_BACKEND_CUDA_TEST
access all tensor data with ggml_backend_tensor_get/set

* add ggml_backend_buffer_clear
zero-init KV cache buffer

* add ggml_backend_buffer_is_hos, used to avoid copies if possible when accesing tensor data

* disable gpu backends with ngl 0

* more accurate mlock

* unmap offloaded part of the model

* use posix_fadvise64(.., POSIX_FADV_SEQUENTIAL) to improve performance with mmap

* update quantize and lora

* update session copy/set to use ggml-backend

ggml-ci

* use posix_fadvise instead of posix_fadvise64

* ggml_backend_alloc_ctx_tensors_from_buft : remove old print

* llama_mmap::align_offset : use pointers instead of references for out parameters

* restore progress_callback behavior

* move final progress_callback call to load_all_data

* cuda : fix fprintf format string (minor)

* do not offload scales

* llama_mmap : avoid unmapping the same fragments again in the destructor

* remove unnecessary unmap

* metal : add default log function that prints to stderr, cleanup code

ggml-ci

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-12-21 20:07:46 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if 0
 								    // use cublasGemmEx
 								    {
 								        for (int i13 = 0; i13 < ne13; ++i13) {
 								            for (int i12 = 0; i12 < ne12; ++i12) {
 								                int i03 = i13 / r3;
 								                int i02 = i12 / r2;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								                CUBLAS_CHECK(
 								                        cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
 								                            ne01, ne11, ne10,
 								                            alpha, (const char *) src0_as_f16 + i02*src0->nb[2]   + i03*src0->nb[3]  , CUDA_R_16F,   nb01/sizeof(half),
 								                                   (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F,   nb11/sizeof(float),
 								                            beta,  (      char *)       dst_t + i12*nbd2          + i13*nbd3,          cu_data_type, ne01,
 								                            cu_compute_type,
 								                            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 								            }
 								        }
 								    }
 								#else
 								    if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
 								        // there is no broadcast and src0, src1 are contiguous across dims 2, 3
 								        // use cublasGemmStridedBatchedEx
 								        CUBLAS_CHECK(
 								        cublasGemmStridedBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
 								                ne01, ne11, ne10,
 								                alpha, (const char *) src0_f16, CUDA_R_16F,   nb01/nb00, nb02/nb00,  // strideA
 								                       (const char *) src1_f16, CUDA_R_16F,   nb11/nb10, nb12/nb10,  // strideB
 								                beta,  (      char *)    dst_t, cu_data_type, ne01,       nb2/nb0,   // strideC
 								                ne12*ne13,
 								                cu_compute_type,
 								                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 								    } else {
 								        // use cublasGemmBatchedEx
 								        const int ne23 = ne12*ne13;
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        ggml_cuda_pool_alloc<const void *> ptrs_src(ctx.pool(), 2*ne23);
 								        ggml_cuda_pool_alloc<      void *> ptrs_dst(ctx.pool(), 1*ne23);
 								        dim3 block_dims(ne13, ne12);
 								        k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
 								                src0_f16, src1_f16, dst_t,
 								                ptrs_src.get(), ptrs_dst.get(),
 								                ne12, ne13,
 								                ne23,
 								                nb02, nb03,
 								                src1->type == GGML_TYPE_F16 ? nb12 : nb12/2,
 								                src1->type == GGML_TYPE_F16 ? nb13 : nb13/2,
 								                nbd2, nbd3,
 								                r2, r3);
 								        CUDA_CHECK(cudaGetLastError());
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        CUBLAS_CHECK(
 								        cublasGemmBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
 								                ne01, ne11, ne10,
 								                alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F,   nb01/nb00,
 								                       (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F,   nb11/nb10,
 								                beta,  (      void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01,
 								                ne23,
 								                cu_compute_type,
 								                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 								    }
 								#endif
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (dst->op_params[0] == GGML_PREC_DEFAULT) {
 								        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
 								        to_fp32_cuda(dst_f16.get(), dst_ddf, ne_dst, main_stream);
 								    }
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int64_t min_compute_capability = INT_MAX;
-												sync : ggml (backend v2) (#3912)

* sync : ggml (backend v2) (wip)

* sync : migrate examples and llama.cpp to dynamic graphs (wip)

* sync : update tests + fix max op params to 64

ggml-ci

* sync : ggml-cuda

ggml-ci

* llama : fix save/load state context size

ggml-ci

* sync : try to fix build on tvOS

* sync : pass custom graph sizes in training examples

* sync : update graph copies to new ggml API

* sync : update sync-ggml.sh with new files

* scripts : fix header in sync script

* train : fix context size calculations

* llama : increase inference graph size up to 4096 nodes

* train : allocate grads for backward graphs

* train : allocate grads for gb_tmp
											
										
										
											2023-11-13 12:16:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    bool any_pascal_with_slow_fp16 = false;
 								    if (split) {
 								        ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
 								        auto & tensor_split = buft_ctx->tensor_split;
 								        for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
 								            // skip devices that are not going to do any work:
 								            if (tensor_split[id] >= (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
 								                continue;
 								            }
-												sync : ggml (backend v2) (#3912)

* sync : ggml (backend v2) (wip)

* sync : migrate examples and llama.cpp to dynamic graphs (wip)

* sync : update tests + fix max op params to 64

ggml-ci

* sync : ggml-cuda

ggml-ci

* llama : fix save/load state context size

ggml-ci

* sync : try to fix build on tvOS

* sync : pass custom graph sizes in training examples

* sync : update graph copies to new ggml API

* sync : update sync-ggml.sh with new files

* scripts : fix header in sync script

* train : fix context size calculations

* llama : increase inference graph size up to 4096 nodes

* train : allocate grads for backward graphs

* train : allocate grads for gb_tmp
											
										
										
											2023-11-13 12:16:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            if (min_compute_capability > get_cuda_global_info().devices[id].cc) {
 								                min_compute_capability = get_cuda_global_info().devices[id].cc;
 								            }
 								            if (get_cuda_global_info().devices[id].cc == 610) {
 								                any_pascal_with_slow_fp16 = true;
 								            }
 								        }
 								    } else {
 								        min_compute_capability    = get_cuda_global_info().devices[ctx.device].cc;
 								        any_pascal_with_slow_fp16 = get_cuda_global_info().devices[ctx.device].cc == 610;
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								    }
-												sync : ggml (backend v2) (#3912)

* sync : ggml (backend v2) (wip)

* sync : migrate examples and llama.cpp to dynamic graphs (wip)

* sync : update tests + fix max op params to 64

ggml-ci

* sync : ggml-cuda

ggml-ci

* llama : fix save/load state context size

ggml-ci

* sync : try to fix build on tvOS

* sync : pass custom graph sizes in training examples

* sync : update graph copies to new ggml API

* sync : update sync-ggml.sh with new files

* scripts : fix header in sync script

* train : fix context size calculations

* llama : increase inference graph size up to 4096 nodes

* train : allocate grads for backward graphs

* train : allocate grads for gb_tmp
											
										
										
											2023-11-13 12:16:23 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // check data types and tensor shapes for custom matrix multiplication kernels:
 								    bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
 								        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
 								        && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    bool          use_mul_mat_vec_q =  ggml_is_quantized(src0->type)
 								        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
 								        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    bool              use_mul_mat_q =  ggml_cuda_supports_mmq(src0->type)
 								        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#ifdef CUDA_USE_TENSOR_CORES
 								    use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
 								#endif // CUDA_USE_TENSOR_CORES
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#else
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // fp16 performance is good on Volta or newer and on P100 (compute capability 6.0)
 								    const bool fp16_performance_good = min_compute_capability >= CC_PASCAL && !any_pascal_with_slow_fp16;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // mmvq and mmq need the __dp4a instruction which on NVIDIA is only available for CC >= 6.1
 								    use_mul_mat_vec_q = use_mul_mat_vec_q && min_compute_capability >= MIN_CC_DP4A;
 								    use_mul_mat_q     = use_mul_mat_q     && min_compute_capability >= MIN_CC_DP4A;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#ifdef CUDA_USE_TENSOR_CORES
 								    // when tensor cores are available, use them for large batch size
 								    // ref: https://github.com/ggerganov/llama.cpp/pull/3776
 								    use_mul_mat_q     = use_mul_mat_q     && (!fp16_performance_good || src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
 								#endif // CUDA_USE_TENSOR_CORES
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // if mmvq is available it's a better choice than dmmv:
 								#ifndef GGML_CUDA_FORCE_DMMV
 								    use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
 								#endif // GGML_CUDA_FORCE_DMMV
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // debug helpers
 								    //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
 								    //printf("      %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
 								    //printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
 								    //printf("      %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
 								    //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
 								    //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
-												backend : offload large batches to GPU (#6083)

* backend : offload large batches to GPU

* fix hip

* code cleanup

* fix CUDA split buffers

* Update ggml-backend-impl.h

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix memset without set_device

* imatrix : remove sched affix from weight names

* sched : add a new split if the current one has too many inputs
reduce max inputs per split
more cleanup

* update backends

ggml-ci

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-03-18 10:03:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
 								        // KQ single-batch
 								        ggml_cuda_mul_mat_vec_p021(ctx, src0, src1, dst);
 								    } else if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
 								        // KQV single-batch
 								        ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst);
 								    } else if (!split && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
 								        // KQ + KQV multi-batch
 								        ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
 								    } else if (use_dequantize_mul_mat_vec) {
 								        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
 								    } else if (use_mul_mat_vec_q) {
 								        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
 								    } else if (use_mul_mat_q) {
 								        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
 								    } else {
 								        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								}
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#if 0
 								template<typename ... Srcs>
 								static __global__ void k_compute_batched_ptrs_id(
 								        const void ** ptrs_src, void ** ptrs_dst,
 								        int ne12, int ne13,
 								        int ne23,
 								        int nb02, int nb03,
 								        int nb12, int nb13,
 								        int nb2, int nb3,
 								        int r2, int r3,
 								        ggml_type src0_type, half * src0_as_f16, int64_t src0_ne,
 								        const half * src1_f16, half * dst_f16,
 								        const int32_t * ids, const int id,
 								        Srcs... src0s) {
-												llama : initial ggml-backend integration (#4520)

* llama : initial ggml-backend integration

* add ggml-metal

* cuda backend can be used though ggml-backend with LLAMA_GGML_BACKEND_CUDA_TEST
access all tensor data with ggml_backend_tensor_get/set

* add ggml_backend_buffer_clear
zero-init KV cache buffer

* add ggml_backend_buffer_is_hos, used to avoid copies if possible when accesing tensor data

* disable gpu backends with ngl 0

* more accurate mlock

* unmap offloaded part of the model

* use posix_fadvise64(.., POSIX_FADV_SEQUENTIAL) to improve performance with mmap

* update quantize and lora

* update session copy/set to use ggml-backend

ggml-ci

* use posix_fadvise instead of posix_fadvise64

* ggml_backend_alloc_ctx_tensors_from_buft : remove old print

* llama_mmap::align_offset : use pointers instead of references for out parameters

* restore progress_callback behavior

* move final progress_callback call to load_all_data

* cuda : fix fprintf format string (minor)

* do not offload scales

* llama_mmap : avoid unmapping the same fragments again in the destructor

* remove unnecessary unmap

* metal : add default log function that prints to stderr, cleanup code

ggml-ci

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-12-21 20:07:46 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int i = ids[id];
-												llama : initial ggml-backend integration (#4520)

* llama : initial ggml-backend integration

* add ggml-metal

* cuda backend can be used though ggml-backend with LLAMA_GGML_BACKEND_CUDA_TEST
access all tensor data with ggml_backend_tensor_get/set

* add ggml_backend_buffer_clear
zero-init KV cache buffer

* add ggml_backend_buffer_is_hos, used to avoid copies if possible when accesing tensor data

* disable gpu backends with ngl 0

* more accurate mlock

* unmap offloaded part of the model

* use posix_fadvise64(.., POSIX_FADV_SEQUENTIAL) to improve performance with mmap

* update quantize and lora

* update session copy/set to use ggml-backend

ggml-ci

* use posix_fadvise instead of posix_fadvise64

* ggml_backend_alloc_ctx_tensors_from_buft : remove old print

* llama_mmap::align_offset : use pointers instead of references for out parameters

* restore progress_callback behavior

* move final progress_callback call to load_all_data

* cuda : fix fprintf format string (minor)

* do not offload scales

* llama_mmap : avoid unmapping the same fragments again in the destructor

* remove unnecessary unmap

* metal : add default log function that prints to stderr, cleanup code

ggml-ci

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-12-21 20:07:46 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    half * src0_f16;
 								    const void * srcs_ar[] = { (const half *) src0s... };
 								    if (src0_type == GGML_TYPE_F16) {
 								        src0_f16 = (half *) srcs_ar[i];
 								    } else {
 								        src0_f16 = src0_as_f16;
 								        if (threadIdx.x == 0 && threadIdx.y == 0) {
 								            const to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(src0_type);
 								            to_fp16(srcs_ar[i], src0_f16, src0_ne, cudaStreamFireAndForget);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								        }
 								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int i13 = blockIdx.x * blockDim.x + threadIdx.x;
 								    int i12 = blockIdx.y * blockDim.y + threadIdx.y;
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (i13 >= ne13 || i12 >= ne12) {
 								        return;
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int i03 = i13 / r3;
 								    int i02 = i12 / r2;
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_f16 + i02*nb02   + i03*nb03;
 								    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_f16 + i12*nb12/2 + i13*nb13/2;
 								    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)  dst_f16 + i12* nb2/2 + i13* nb3/2;
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
 								    const struct ggml_tensor * ids = dst->src[0];
 								    const struct ggml_tensor * src1 = dst->src[1];
 								    const struct ggml_tensor * src00 = dst->src[2];
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int id = dst->op_params[0];
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_ASSERT(!ggml_is_transposed(src00));
 								    GGML_ASSERT(!ggml_is_transposed(src1));
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_ASSERT(src00->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
 								    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00);
 								    const int64_t ne01 = src00->ne[1];
 								    const int64_t ne02 = src00->ne[2];
 								    const int64_t ne03 = src00->ne[3];
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    //const int64_t nb01 = src00->nb[1];
 								    const int64_t nb02 = src00->nb[2]; GGML_UNUSED(nb02);
 								    const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03);
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t ne10 = src1->ne[0];
 								    const int64_t ne11 = src1->ne[1];
 								    const int64_t ne12 = src1->ne[2];
 								    const int64_t ne13 = src1->ne[3];
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    //const int64_t nb11 = src1->nb[1];
 								    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
 								    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t ne1 = ggml_nelements(src1);
 								    const int64_t ne  = ggml_nelements(dst);
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ggml_cuda_set_device(g_main_device);
 								    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    //ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
 								    //void * src0_ddq = src0_extra->data_device[g_main_device];
 								    //half * src0_as_f16 = (half *) src0_ddq;
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
 								    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
 								    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // convert src1 to fp16
 								    const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
 								    GGML_ASSERT(to_fp16_cuda != nullptr);
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    size_t src1_as = 0;
 								    half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
 								    to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    size_t dst_as = 0;
 								    half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_ASSERT(ne12 % ne02 == 0);
 								    GGML_ASSERT(ne13 % ne03 == 0);
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // broadcast factors
 								    const int64_t r2 = ne12/ne02;
 								    const int64_t r3 = ne13/ne03;
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const half alpha_f16 = 1.0f;
 								    const half beta_f16  = 0.0f;
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    // use cublasGemmBatchedEx
 								    const int ne23 = ne12*ne13;
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const void ** ptrs_src = nullptr;
 								          void ** ptrs_dst = nullptr;
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    size_t ptrs_src_s = 0;
 								    size_t ptrs_dst_s = 0;
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
 								    ptrs_dst = (      void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int64_t src0_ne = ggml_nelements(src00);
 								    half * src0_as_f16 = nullptr;
 								    size_t src0_as = 0;
 								    if (src00->type != GGML_TYPE_F16) {
 								        src0_as_f16 = (half *) ggml_cuda_pool_malloc(src0_ne * sizeof(half), &src0_as);
-												llama : add pipeline parallelism support (#6017)

* llama : add pipeline parallelism support for batch processing with multiple CUDA GPUs

ggml-ci

* server : add -ub, --ubatch-size parameter

* fix server embedding test

* llama : fix Mamba inference for pipeline parallelism

Tested to work correctly with both `main` and `parallel` examples.

* llama : limit max batch size to n_batch

* add LLAMA_SCHED_MAX_COPIES to configure the number of input copies for pipeline parallelism
default increase to 4 (from 2)

changing this value may improve performance for some systems, but increases memory usage

* fix hip build

* fix sycl build (disable cpy_tensor_async)

* fix hip build

* llama : limit n_batch and n_ubatch to n_ctx during context creation

* llama : fix norm backend

* batched-bench : sync after decode

* swiftui : sync after decode

* ggml : allow ggml_get_rows to use multiple threads if they are available

* check n_ubatch >= n_tokens with non-casual attention

* llama : do not limit n_batch to n_ctx with non-casual attn

* server : construct batch with size of llama_n_batch

* ggml_backend_cpu_graph_compute : fix return value when alloc fails

* llama : better n_batch and n_ubatch comment

* fix merge

* small fix

* reduce default n_batch to 2048

---------

Co-authored-by: Francis Couture-Harpin <git@compilade.net>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-03-13 17:54:21 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    static_assert(GGML_MAX_SRC == 6, "GGML_MAX_SRC == 6");
 								    dim3 block_dims(ne13, ne12);
 								    k_compute_batched_ptrs_id<<<1, block_dims, 0, main_stream>>>(
 								            ptrs_src, ptrs_dst,
 								            ne12, ne13,
 								            ne23,
 								            ne00*ne01*sizeof(half), ne00*ne01*ne02*sizeof(half),
 								            nb12, nb13,
 								            dst->nb[2], dst->nb[3],
 								            r2, r3,
 								            src00->type, src0_as_f16, src0_ne,
 								            src1_as_f16, dst_f16,
 								            (const int *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device], id,
 								            dst->src[2] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[2]->extra)->data_device[g_main_device] : nullptr,
 								            dst->src[3] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[3]->extra)->data_device[g_main_device] : nullptr,
 								            dst->src[4] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device] : nullptr,
 								            dst->src[5] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device] : nullptr
 								    );
 								    CUDA_CHECK(cudaGetLastError());
 								    CUBLAS_CHECK(
 								    cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
 								            ne01, ne11, ne10,
 								            &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, ne00,
 								                        (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, ne10,
 								            &beta_f16,  (      void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
 								            ne23,
 								            CUBLAS_COMPUTE_16F,
 								            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 								    if (src0_as != 0) {
 								        ggml_cuda_pool_free(src0_as_f16, src0_as);
 								    }
 								    if (ptrs_src_s != 0) {
 								        ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
 								    }
 								    if (ptrs_dst_s != 0) {
 								        ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
 								    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
 								    to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
 								    ggml_cuda_pool_free(src1_as_f16, src1_as);
 								    ggml_cuda_pool_free(dst_f16, dst_as);
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								#endif
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								#if 0
 								    ggml_cuda_mul_mat_id_cublas(dst);
 								    // TODO: mmq/mmv support
 								#endif
 								    cudaStream_t stream = ctx.stream();
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const size_t nb11 = src1->nb[1];
 								    const size_t nb1  =  dst->nb[1];
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const struct ggml_tensor * ids = src0;
 								    const int32_t id = ((int32_t *) dst->op_params)[0];
 								    const int32_t n_as = ((int32_t *) dst->op_params)[1];
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    std::vector<char> ids_host(ggml_nbytes(ids));
 								    const char * ids_dev = (const char *) ids->data;
 								    CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
 								    CUDA_CHECK(cudaStreamSynchronize(stream));
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ggml_tensor src1_row = *src1;
 								    ggml_tensor dst_row = *dst;
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    char * src1_original = (char *) src1->data;
 								    char * dst_original  = (char *)  dst->data;
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (src1->ne[1] == 1) {
 								        for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
 								            const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            GGML_ASSERT(row_id >= 0 && row_id < n_as);
-												llama : add pipeline parallelism support (#6017)

* llama : add pipeline parallelism support for batch processing with multiple CUDA GPUs

ggml-ci

* server : add -ub, --ubatch-size parameter

* fix server embedding test

* llama : fix Mamba inference for pipeline parallelism

Tested to work correctly with both `main` and `parallel` examples.

* llama : limit max batch size to n_batch

* add LLAMA_SCHED_MAX_COPIES to configure the number of input copies for pipeline parallelism
default increase to 4 (from 2)

changing this value may improve performance for some systems, but increases memory usage

* fix hip build

* fix sycl build (disable cpy_tensor_async)

* fix hip build

* llama : limit n_batch and n_ubatch to n_ctx during context creation

* llama : fix norm backend

* batched-bench : sync after decode

* swiftui : sync after decode

* ggml : allow ggml_get_rows to use multiple threads if they are available

* check n_ubatch >= n_tokens with non-casual attention

* llama : do not limit n_batch to n_ctx with non-casual attn

* server : construct batch with size of llama_n_batch

* ggml_backend_cpu_graph_compute : fix return value when alloc fails

* llama : better n_batch and n_ubatch comment

* fix merge

* small fix

* reduce default n_batch to 2048

---------

Co-authored-by: Francis Couture-Harpin <git@compilade.net>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-03-13 17:54:21 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            src1_row.data = src1_original + i01*src1->nb[1];
 								            dst_row.data  =  dst_original + i01*dst->nb[1];
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            ggml_cuda_mul_mat(ctx, src0_row, &src1_row, &dst_row);
 								        }
 								    } else {
 								        ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
 								        ggml_cuda_pool_alloc<char>  dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        src1_row.data = src1_contiguous.get();
 								        dst_row.data  =  dst_contiguous.get();
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        for (int32_t row_id = 0; row_id < n_as; ++row_id) {
 								            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            int64_t num_src1_rows = 0;
 								            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
 								                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								                if (row_id_i != row_id) {
 								                    continue;
 								                }
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								                GGML_ASSERT(row_id >= 0 && row_id < n_as);
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								                CUDA_CHECK(cudaMemcpyAsync(src1_contiguous.get() + num_src1_rows*nb11, src1_original + i01*nb11,
 								                                        nb11, cudaMemcpyDeviceToDevice, stream));
 								                num_src1_rows++;
 								            }
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            if (num_src1_rows == 0) {
 								                continue;
 								            }
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            src1_row.ne[1] = num_src1_rows;
 								            dst_row.ne[1] = num_src1_rows;
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            src1_row.nb[1] = nb11;
 								            src1_row.nb[2] = num_src1_rows*nb11;
 								            src1_row.nb[3] = num_src1_rows*nb11;
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            dst_row.nb[1] = nb1;
 								            dst_row.nb[2] = num_src1_rows*nb1;
 								            dst_row.nb[3] = num_src1_rows*nb1;
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            ggml_cuda_mul_mat(ctx, src0_row, &src1_row, &dst_row);
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            num_src1_rows = 0;
 								            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
 								                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								                if (row_id_i != row_id) {
 								                    continue;
 								                }
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								                GGML_ASSERT(row_id >= 0 && row_id < n_as);
 								                CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous.get() + num_src1_rows*nb1,
 								                                        nb1, cudaMemcpyDeviceToDevice, stream));
 								                num_src1_rows++;
 								            }
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								        }
 								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								}
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_scale(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_scale);
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_clamp(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_clamp);
 								}
 								static void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    const int64_t ne = ggml_nelements(src0);
 								    GGML_ASSERT(ne == ggml_nelements(src1));
 								    GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
 								    GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
 								    GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
 								    const int64_t ne00 = src0->ne[0];
 								    const int64_t ne01 = src0->ne[1];
 								    const int64_t ne02 = src0->ne[2];
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    //GGML_ASSERT(src0->ne[3] == 1);
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t nb00 = src0->nb[0];
 								    const int64_t nb01 = src0->nb[1];
 								    const int64_t nb02 = src0->nb[2];
 								    const int64_t nb03 = src0->nb[3];
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t ne10 = src1->ne[0];
 								    const int64_t ne11 = src1->ne[1];
 								    const int64_t ne12 = src1->ne[2];
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    //GGML_ASSERT(src1->ne[3] == 1);
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    const int64_t nb10 = src1->nb[0];
 								    const int64_t nb11 = src1->nb[1];
 								    const int64_t nb12 = src1->nb[2];
 								    const int64_t nb13 = src1->nb[3];
-												backend : offload large batches to GPU (#6083)

* backend : offload large batches to GPU

* fix hip

* code cleanup

* fix CUDA split buffers

* Update ggml-backend-impl.h

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix memset without set_device

* imatrix : remove sched affix from weight names

* sched : add a new split if the current one has too many inputs
reduce max inputs per split
more cleanup

* update backends

ggml-ci

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-03-18 10:03:04 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ggml_cuda_set_device(ctx.device);
 								    cudaStream_t main_stream = ctx.stream();
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    char * src0_ddc = (char *) src0->data;
 								    char * src1_ddc = (char *) src1->data;
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
 								        ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
 								    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
 								        ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
 								    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
 								        ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
 								    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
 								        ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
 								    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
 								        ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-												Add ability to use Q5_0, Q5_1, and IQ4_NL for quantized K cache (#6183)

* k_cache: be able to use Q5_0

* k_cache: be able to use Q5_1 on CODA

* k_cache: be able to use Q5_0 on Metal

* k_cache: be able to use Q5_1 on Metal

* k_cache: be able to use IQ4_NL - just CUDA for now

* k_cache: be able to use IQ4_NL on Metal

* k_cache: add newly added supported types to llama-bench and CUDA supports_op

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-03-21 07:27:57 +00:00
+								    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
 								        ggml_cpy_f32_q5_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
 								    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
 								        ggml_cpy_f32_iq4_nl_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
 								    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
 								        ggml_cpy_f32_q5_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
 								        ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
 								    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
 								        ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								    } else {
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
 								                ggml_type_name(src0->type), ggml_type_name(src1->type));
 								        GGML_ASSERT(false);
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(dst);
 								}
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_dup(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    // TODO: why do we pass dst as src1 here?
 								    ggml_cuda_cpy(ctx, src0, dst, nullptr);
 								    GGML_UNUSED(src1);
 								}
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_diag_mask_inf(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_diag_mask_inf);
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_soft_max(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_soft_max);
 								}
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_rope(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
 								    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_rope);
 								}
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_alibi(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_alibi);
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_pool2d(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_pool2d);
 								}
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_im2col(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_im2col);
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_sum_rows(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    GGML_ASSERT(ggml_is_contiguous(src0));
 								    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_sum_rows);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_argsort(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    GGML_ASSERT(ggml_is_contiguous(src0));
 								    ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_argsort);
 								}
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 13:34:22 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								static void ggml_cuda_nop(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    GGML_UNUSED(ctx);
 								    GGML_UNUSED(src0);
 								    GGML_UNUSED(src1);
 								    GGML_UNUSED(dst);
 								}
 								static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * tensor) {
 								    // FIXME: where should this be?
 								    if (tensor->src[0] != nullptr && ggml_backend_buffer_is_cuda_split(tensor->src[0]->buffer)) {
 								        ggml_cuda_set_peer_access(tensor->src[1]->ne[1], ctx.device);
-												fallback to CPU buffer if host buffer alloc fails (#4610)


											
										
										
											2023-12-23 15:10:51 +00:00
+								    }
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ggml_cuda_func_t func;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    switch (tensor->op) {
 								        case GGML_OP_REPEAT:
 								            func = ggml_cuda_repeat;
 								            break;
 								        case GGML_OP_GET_ROWS:
 								            func = ggml_cuda_get_rows;
 								            break;
 								        case GGML_OP_DUP:
 								            func = ggml_cuda_dup;
 								            break;
 								        case GGML_OP_ADD:
 								            func = ggml_cuda_add;
 								            break;
 								        case GGML_OP_ACC:
 								            func = ggml_cuda_acc;
 								            break;
 								        case GGML_OP_MUL:
 								            func = ggml_cuda_mul;
 								            break;
 								        case GGML_OP_DIV:
 								            func = ggml_cuda_div;
 								            break;
 								        case GGML_OP_UNARY:
 								            switch (ggml_get_unary_op(tensor)) {
 								                case GGML_UNARY_OP_GELU:
 								                    func = ggml_cuda_gelu;
 								                    break;
 								                case GGML_UNARY_OP_SILU:
 								                    func = ggml_cuda_silu;
 								                    break;
 								                case GGML_UNARY_OP_GELU_QUICK:
 								                    func = ggml_cuda_gelu_quick;
 								                    break;
 								                case GGML_UNARY_OP_TANH:
 								                    func = ggml_cuda_tanh;
 								                    break;
 								                case GGML_UNARY_OP_RELU:
 								                    func = ggml_cuda_relu;
 								                    break;
 								                case GGML_UNARY_OP_HARDSIGMOID:
 								                    func = ggml_cuda_hardsigmoid;
 								                    break;
 								                case GGML_UNARY_OP_HARDSWISH:
 								                    func = ggml_cuda_hardswish;
 								                    break;
 								                default:
 								                    return false;
 								            }
 								            break;
 								        case GGML_OP_NORM:
 								            func = ggml_cuda_norm;
 								            break;
 								        case GGML_OP_GROUP_NORM:
 								            func = ggml_cuda_group_norm;
 								            break;
 								        case GGML_OP_CONCAT:
 								            func = ggml_cuda_concat;
 								            break;
 								        case GGML_OP_UPSCALE:
 								            func = ggml_cuda_upscale;
 								            break;
 								        case GGML_OP_PAD:
 								            func = ggml_cuda_pad;
 								            break;
 								        case GGML_OP_ARANGE:
 								            func = ggml_cuda_arange;
 								            break;
 								        case GGML_OP_TIMESTEP_EMBEDDING:
 								            func = ggml_cuda_timestep_embedding;
 								            break;
 								        case GGML_OP_LEAKY_RELU:
 								            func = ggml_cuda_leaky_relu;
 								            break;
 								        case GGML_OP_RMS_NORM:
 								            func = ggml_cuda_rms_norm;
 								            break;
 								        case GGML_OP_MUL_MAT:
 								            if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
 								                fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
 								                return false;
 								            } else {
 								                func = ggml_cuda_mul_mat;
 								            }
 								            break;
 								        case GGML_OP_MUL_MAT_ID:
 								            func = ggml_cuda_mul_mat_id;
 								            break;
 								        case GGML_OP_SCALE:
 								            func = ggml_cuda_scale;
 								            break;
 								        case GGML_OP_SQR:
 								            func = ggml_cuda_sqr;
 								            break;
 								        case GGML_OP_CLAMP:
 								            func = ggml_cuda_clamp;
 								            break;
 								        case GGML_OP_CPY:
 								            func = ggml_cuda_cpy;
 								            break;
 								        case GGML_OP_CONT:
 								            func = ggml_cuda_dup;
 								            break;
 								        case GGML_OP_NONE:
 								        case GGML_OP_RESHAPE:
 								        case GGML_OP_VIEW:
 								        case GGML_OP_PERMUTE:
 								        case GGML_OP_TRANSPOSE:
 								            func = ggml_cuda_nop;
 								            break;
 								        case GGML_OP_DIAG_MASK_INF:
 								            func = ggml_cuda_diag_mask_inf;
 								            break;
 								        case GGML_OP_SOFT_MAX:
 								            func = ggml_cuda_soft_max;
 								            break;
 								        case GGML_OP_ROPE:
 								            func = ggml_cuda_rope;
 								            break;
 								        case GGML_OP_ALIBI:
 								            func = ggml_cuda_alibi;
 								            break;
 								        case GGML_OP_IM2COL:
 								            func = ggml_cuda_im2col;
 								            break;
 								        case GGML_OP_POOL_2D:
 								            func = ggml_cuda_pool2d;
 								            break;
 								        case GGML_OP_SUM_ROWS:
 								            func = ggml_cuda_sum_rows;
 								            break;
 								        case GGML_OP_ARGSORT:
 								            func = ggml_cuda_argsort;
 								            break;
 								        default:
 								            return false;
 								    }
 								    func(ctx, tensor->src[0], tensor->src[1], tensor);
 								    return true;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								}
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								////////////////////////////////////////////////////////////////////////////////
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												llama : add pipeline parallelism support (#6017)

* llama : add pipeline parallelism support for batch processing with multiple CUDA GPUs

ggml-ci

* server : add -ub, --ubatch-size parameter

* fix server embedding test

* llama : fix Mamba inference for pipeline parallelism

Tested to work correctly with both `main` and `parallel` examples.

* llama : limit max batch size to n_batch

* add LLAMA_SCHED_MAX_COPIES to configure the number of input copies for pipeline parallelism
default increase to 4 (from 2)

changing this value may improve performance for some systems, but increases memory usage

* fix hip build

* fix sycl build (disable cpy_tensor_async)

* fix hip build

* llama : limit n_batch and n_ubatch to n_ctx during context creation

* llama : fix norm backend

* batched-bench : sync after decode

* swiftui : sync after decode

* ggml : allow ggml_get_rows to use multiple threads if they are available

* check n_ubatch >= n_tokens with non-casual attention

* llama : do not limit n_batch to n_ctx with non-casual attn

* server : construct batch with size of llama_n_batch

* ggml_backend_cpu_graph_compute : fix return value when alloc fails

* llama : better n_batch and n_ubatch comment

* fix merge

* small fix

* reduce default n_batch to 2048

---------

Co-authored-by: Francis Couture-Harpin <git@compilade.net>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-03-13 17:54:21 +00:00
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								// backend
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 11:16:33 +00:00
+								GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								    return cuda_ctx->name.c_str();
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
+								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 11:16:33 +00:00
+								GGML_CALL static void ggml_backend_cuda_free(ggml_backend_t backend) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
 								    delete cuda_ctx;
 								    delete backend;
 								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 11:16:33 +00:00
+								GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
 								    return ggml_backend_cuda_buffer_type(cuda_ctx->device);
 								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 11:16:33 +00:00
+								GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-												llama : add pipeline parallelism support (#6017)

* llama : add pipeline parallelism support for batch processing with multiple CUDA GPUs

ggml-ci

* server : add -ub, --ubatch-size parameter

* fix server embedding test

* llama : fix Mamba inference for pipeline parallelism

Tested to work correctly with both `main` and `parallel` examples.

* llama : limit max batch size to n_batch

* add LLAMA_SCHED_MAX_COPIES to configure the number of input copies for pipeline parallelism
default increase to 4 (from 2)

changing this value may improve performance for some systems, but increases memory usage

* fix hip build

* fix sycl build (disable cpy_tensor_async)

* fix hip build

* llama : limit n_batch and n_ubatch to n_ctx during context creation

* llama : fix norm backend

* batched-bench : sync after decode

* swiftui : sync after decode

* ggml : allow ggml_get_rows to use multiple threads if they are available

* check n_ubatch >= n_tokens with non-casual attention

* llama : do not limit n_batch to n_ctx with non-casual attn

* server : construct batch with size of llama_n_batch

* ggml_backend_cpu_graph_compute : fix return value when alloc fails

* llama : better n_batch and n_ubatch comment

* fix merge

* small fix

* reduce default n_batch to 2048

---------

Co-authored-by: Francis Couture-Harpin <git@compilade.net>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-03-13 17:54:21 +00:00
+								    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												llama : add pipeline parallelism support (#6017)

* llama : add pipeline parallelism support for batch processing with multiple CUDA GPUs

ggml-ci

* server : add -ub, --ubatch-size parameter

* fix server embedding test

* llama : fix Mamba inference for pipeline parallelism

Tested to work correctly with both `main` and `parallel` examples.

* llama : limit max batch size to n_batch

* add LLAMA_SCHED_MAX_COPIES to configure the number of input copies for pipeline parallelism
default increase to 4 (from 2)

changing this value may improve performance for some systems, but increases memory usage

* fix hip build

* fix sycl build (disable cpy_tensor_async)

* fix hip build

* llama : limit n_batch and n_ubatch to n_ctx during context creation

* llama : fix norm backend

* batched-bench : sync after decode

* swiftui : sync after decode

* ggml : allow ggml_get_rows to use multiple threads if they are available

* check n_ubatch >= n_tokens with non-casual attention

* llama : do not limit n_batch to n_ctx with non-casual attn

* server : construct batch with size of llama_n_batch

* ggml_backend_cpu_graph_compute : fix return value when alloc fails

* llama : better n_batch and n_ubatch comment

* fix merge

* small fix

* reduce default n_batch to 2048

---------

Co-authored-by: Francis Couture-Harpin <git@compilade.net>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-03-13 17:54:21 +00:00
+								    GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream()));
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
+								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 11:16:33 +00:00
+								GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-												llama : add pipeline parallelism support (#6017)

* llama : add pipeline parallelism support for batch processing with multiple CUDA GPUs

ggml-ci

* server : add -ub, --ubatch-size parameter

* fix server embedding test

* llama : fix Mamba inference for pipeline parallelism

Tested to work correctly with both `main` and `parallel` examples.

* llama : limit max batch size to n_batch

* add LLAMA_SCHED_MAX_COPIES to configure the number of input copies for pipeline parallelism
default increase to 4 (from 2)

changing this value may improve performance for some systems, but increases memory usage

* fix hip build

* fix sycl build (disable cpy_tensor_async)

* fix hip build

* llama : limit n_batch and n_ubatch to n_ctx during context creation

* llama : fix norm backend

* batched-bench : sync after decode

* swiftui : sync after decode

* ggml : allow ggml_get_rows to use multiple threads if they are available

* check n_ubatch >= n_tokens with non-casual attention

* llama : do not limit n_batch to n_ctx with non-casual attn

* server : construct batch with size of llama_n_batch

* ggml_backend_cpu_graph_compute : fix return value when alloc fails

* llama : better n_batch and n_ubatch comment

* fix merge

* small fix

* reduce default n_batch to 2048

---------

Co-authored-by: Francis Couture-Harpin <git@compilade.net>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-03-13 17:54:21 +00:00
+								    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												llama : add pipeline parallelism support (#6017)

* llama : add pipeline parallelism support for batch processing with multiple CUDA GPUs

ggml-ci

* server : add -ub, --ubatch-size parameter

* fix server embedding test

* llama : fix Mamba inference for pipeline parallelism

Tested to work correctly with both `main` and `parallel` examples.

* llama : limit max batch size to n_batch

* add LLAMA_SCHED_MAX_COPIES to configure the number of input copies for pipeline parallelism
default increase to 4 (from 2)

changing this value may improve performance for some systems, but increases memory usage

* fix hip build

* fix sycl build (disable cpy_tensor_async)

* fix hip build

* llama : limit n_batch and n_ubatch to n_ctx during context creation

* llama : fix norm backend

* batched-bench : sync after decode

* swiftui : sync after decode

* ggml : allow ggml_get_rows to use multiple threads if they are available

* check n_ubatch >= n_tokens with non-casual attention

* llama : do not limit n_batch to n_ctx with non-casual attn

* server : construct batch with size of llama_n_batch

* ggml_backend_cpu_graph_compute : fix return value when alloc fails

* llama : better n_batch and n_ubatch comment

* fix merge

* small fix

* reduce default n_batch to 2048

---------

Co-authored-by: Francis Couture-Harpin <git@compilade.net>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-03-13 17:54:21 +00:00
+								    GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cuda_ctx->stream()));
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
+								}
-												llama : add pipeline parallelism support (#6017)

* llama : add pipeline parallelism support for batch processing with multiple CUDA GPUs

ggml-ci

* server : add -ub, --ubatch-size parameter

* fix server embedding test

* llama : fix Mamba inference for pipeline parallelism

Tested to work correctly with both `main` and `parallel` examples.

* llama : limit max batch size to n_batch

* add LLAMA_SCHED_MAX_COPIES to configure the number of input copies for pipeline parallelism
default increase to 4 (from 2)

changing this value may improve performance for some systems, but increases memory usage

* fix hip build

* fix sycl build (disable cpy_tensor_async)

* fix hip build

* llama : limit n_batch and n_ubatch to n_ctx during context creation

* llama : fix norm backend

* batched-bench : sync after decode

* swiftui : sync after decode

* ggml : allow ggml_get_rows to use multiple threads if they are available

* check n_ubatch >= n_tokens with non-casual attention

* llama : do not limit n_batch to n_ctx with non-casual attn

* server : construct batch with size of llama_n_batch

* ggml_backend_cpu_graph_compute : fix return value when alloc fails

* llama : better n_batch and n_ubatch comment

* fix merge

* small fix

* reduce default n_batch to 2048

---------

Co-authored-by: Francis Couture-Harpin <git@compilade.net>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-03-13 17:54:21 +00:00
+								GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
 								    GGML_ASSERT(ggml_backend_is_cuda(backend_src) || ggml_backend_is_cuda(backend_dst));
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
-												llama : add pipeline parallelism support (#6017)

* llama : add pipeline parallelism support for batch processing with multiple CUDA GPUs

ggml-ci

* server : add -ub, --ubatch-size parameter

* fix server embedding test

* llama : fix Mamba inference for pipeline parallelism

Tested to work correctly with both `main` and `parallel` examples.

* llama : limit max batch size to n_batch

* add LLAMA_SCHED_MAX_COPIES to configure the number of input copies for pipeline parallelism
default increase to 4 (from 2)

changing this value may improve performance for some systems, but increases memory usage

* fix hip build

* fix sycl build (disable cpy_tensor_async)

* fix hip build

* llama : limit n_batch and n_ubatch to n_ctx during context creation

* llama : fix norm backend

* batched-bench : sync after decode

* swiftui : sync after decode

* ggml : allow ggml_get_rows to use multiple threads if they are available

* check n_ubatch >= n_tokens with non-casual attention

* llama : do not limit n_batch to n_ctx with non-casual attn

* server : construct batch with size of llama_n_batch

* ggml_backend_cpu_graph_compute : fix return value when alloc fails

* llama : better n_batch and n_ubatch comment

* fix merge

* small fix

* reduce default n_batch to 2048

---------

Co-authored-by: Francis Couture-Harpin <git@compilade.net>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-03-13 17:54:21 +00:00
+								    ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
 								    ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
 								    if (!ggml_backend_buffer_is_cuda(src->buffer)) {
 								        return false;
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								    }
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
-												llama : add pipeline parallelism support (#6017)

* llama : add pipeline parallelism support for batch processing with multiple CUDA GPUs

ggml-ci

* server : add -ub, --ubatch-size parameter

* fix server embedding test

* llama : fix Mamba inference for pipeline parallelism

Tested to work correctly with both `main` and `parallel` examples.

* llama : limit max batch size to n_batch

* add LLAMA_SCHED_MAX_COPIES to configure the number of input copies for pipeline parallelism
default increase to 4 (from 2)

changing this value may improve performance for some systems, but increases memory usage

* fix hip build

* fix sycl build (disable cpy_tensor_async)

* fix hip build

* llama : limit n_batch and n_ubatch to n_ctx during context creation

* llama : fix norm backend

* batched-bench : sync after decode

* swiftui : sync after decode

* ggml : allow ggml_get_rows to use multiple threads if they are available

* check n_ubatch >= n_tokens with non-casual attention

* llama : do not limit n_batch to n_ctx with non-casual attn

* server : construct batch with size of llama_n_batch

* ggml_backend_cpu_graph_compute : fix return value when alloc fails

* llama : better n_batch and n_ubatch comment

* fix merge

* small fix

* reduce default n_batch to 2048

---------

Co-authored-by: Francis Couture-Harpin <git@compilade.net>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-03-13 17:54:21 +00:00
+								    if (!ggml_backend_buffer_is_cuda(dst->buffer)) {
 								        return false;
 								    }
 								    // device -> device
 								    ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
 								    ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
 								    if (backend_src != backend_dst) {
 								        ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
 								        ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
 								        GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device);
 								        GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device);
 								        if (!cuda_ctx_src->copy_event) {
 								            ggml_cuda_set_device(cuda_ctx_src->device);
 								            CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
 								        }
 								        // copy on src stream
 								        if (cuda_ctx_src->device == cuda_ctx_dst->device) {
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
-												llama : add pipeline parallelism support (#6017)

* llama : add pipeline parallelism support for batch processing with multiple CUDA GPUs

ggml-ci

* server : add -ub, --ubatch-size parameter

* fix server embedding test

* llama : fix Mamba inference for pipeline parallelism

Tested to work correctly with both `main` and `parallel` examples.

* llama : limit max batch size to n_batch

* add LLAMA_SCHED_MAX_COPIES to configure the number of input copies for pipeline parallelism
default increase to 4 (from 2)

changing this value may improve performance for some systems, but increases memory usage

* fix hip build

* fix sycl build (disable cpy_tensor_async)

* fix hip build

* llama : limit n_batch and n_ubatch to n_ctx during context creation

* llama : fix norm backend

* batched-bench : sync after decode

* swiftui : sync after decode

* ggml : allow ggml_get_rows to use multiple threads if they are available

* check n_ubatch >= n_tokens with non-casual attention

* llama : do not limit n_batch to n_ctx with non-casual attn

* server : construct batch with size of llama_n_batch

* ggml_backend_cpu_graph_compute : fix return value when alloc fails

* llama : better n_batch and n_ubatch comment

* fix merge

* small fix

* reduce default n_batch to 2048

---------

Co-authored-by: Francis Couture-Harpin <git@compilade.net>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-03-13 17:54:21 +00:00
+								        } else {
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								            CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
-												llama : add pipeline parallelism support (#6017)

* llama : add pipeline parallelism support for batch processing with multiple CUDA GPUs

ggml-ci

* server : add -ub, --ubatch-size parameter

* fix server embedding test

* llama : fix Mamba inference for pipeline parallelism

Tested to work correctly with both `main` and `parallel` examples.

* llama : limit max batch size to n_batch

* add LLAMA_SCHED_MAX_COPIES to configure the number of input copies for pipeline parallelism
default increase to 4 (from 2)

changing this value may improve performance for some systems, but increases memory usage

* fix hip build

* fix sycl build (disable cpy_tensor_async)

* fix hip build

* llama : limit n_batch and n_ubatch to n_ctx during context creation

* llama : fix norm backend

* batched-bench : sync after decode

* swiftui : sync after decode

* ggml : allow ggml_get_rows to use multiple threads if they are available

* check n_ubatch >= n_tokens with non-casual attention

* llama : do not limit n_batch to n_ctx with non-casual attn

* server : construct batch with size of llama_n_batch

* ggml_backend_cpu_graph_compute : fix return value when alloc fails

* llama : better n_batch and n_ubatch comment

* fix merge

* small fix

* reduce default n_batch to 2048

---------

Co-authored-by: Francis Couture-Harpin <git@compilade.net>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-03-13 17:54:21 +00:00
+								        }
 								        // record event on src stream
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        CUDA_CHECK(cudaEventRecord(cuda_ctx_src->copy_event, cuda_ctx_src->stream()));
-												llama : add pipeline parallelism support (#6017)

* llama : add pipeline parallelism support for batch processing with multiple CUDA GPUs

ggml-ci

* server : add -ub, --ubatch-size parameter

* fix server embedding test

* llama : fix Mamba inference for pipeline parallelism

Tested to work correctly with both `main` and `parallel` examples.

* llama : limit max batch size to n_batch

* add LLAMA_SCHED_MAX_COPIES to configure the number of input copies for pipeline parallelism
default increase to 4 (from 2)

changing this value may improve performance for some systems, but increases memory usage

* fix hip build

* fix sycl build (disable cpy_tensor_async)

* fix hip build

* llama : limit n_batch and n_ubatch to n_ctx during context creation

* llama : fix norm backend

* batched-bench : sync after decode

* swiftui : sync after decode

* ggml : allow ggml_get_rows to use multiple threads if they are available

* check n_ubatch >= n_tokens with non-casual attention

* llama : do not limit n_batch to n_ctx with non-casual attn

* server : construct batch with size of llama_n_batch

* ggml_backend_cpu_graph_compute : fix return value when alloc fails

* llama : better n_batch and n_ubatch comment

* fix merge

* small fix

* reduce default n_batch to 2048

---------

Co-authored-by: Francis Couture-Harpin <git@compilade.net>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-03-13 17:54:21 +00:00
 								        // wait on dst stream for the copy to complete
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_dst->stream(), cuda_ctx_src->copy_event, 0));
-												llama : add pipeline parallelism support (#6017)

* llama : add pipeline parallelism support for batch processing with multiple CUDA GPUs

ggml-ci

* server : add -ub, --ubatch-size parameter

* fix server embedding test

* llama : fix Mamba inference for pipeline parallelism

Tested to work correctly with both `main` and `parallel` examples.

* llama : limit max batch size to n_batch

* add LLAMA_SCHED_MAX_COPIES to configure the number of input copies for pipeline parallelism
default increase to 4 (from 2)

changing this value may improve performance for some systems, but increases memory usage

* fix hip build

* fix sycl build (disable cpy_tensor_async)

* fix hip build

* llama : limit n_batch and n_ubatch to n_ctx during context creation

* llama : fix norm backend

* batched-bench : sync after decode

* swiftui : sync after decode

* ggml : allow ggml_get_rows to use multiple threads if they are available

* check n_ubatch >= n_tokens with non-casual attention

* llama : do not limit n_batch to n_ctx with non-casual attn

* server : construct batch with size of llama_n_batch

* ggml_backend_cpu_graph_compute : fix return value when alloc fails

* llama : better n_batch and n_ubatch comment

* fix merge

* small fix

* reduce default n_batch to 2048

---------

Co-authored-by: Francis Couture-Harpin <git@compilade.net>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-03-13 17:54:21 +00:00
+								    } else {
 								        // src and dst are on the same backend
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
-												llama : add pipeline parallelism support (#6017)

* llama : add pipeline parallelism support for batch processing with multiple CUDA GPUs

ggml-ci

* server : add -ub, --ubatch-size parameter

* fix server embedding test

* llama : fix Mamba inference for pipeline parallelism

Tested to work correctly with both `main` and `parallel` examples.

* llama : limit max batch size to n_batch

* add LLAMA_SCHED_MAX_COPIES to configure the number of input copies for pipeline parallelism
default increase to 4 (from 2)

changing this value may improve performance for some systems, but increases memory usage

* fix hip build

* fix sycl build (disable cpy_tensor_async)

* fix hip build

* llama : limit n_batch and n_ubatch to n_ctx during context creation

* llama : fix norm backend

* batched-bench : sync after decode

* swiftui : sync after decode

* ggml : allow ggml_get_rows to use multiple threads if they are available

* check n_ubatch >= n_tokens with non-casual attention

* llama : do not limit n_batch to n_ctx with non-casual attn

* server : construct batch with size of llama_n_batch

* ggml_backend_cpu_graph_compute : fix return value when alloc fails

* llama : better n_batch and n_ubatch comment

* fix merge

* small fix

* reduce default n_batch to 2048

---------

Co-authored-by: Francis Couture-Harpin <git@compilade.net>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-03-13 17:54:21 +00:00
+								    }
 								    return true;
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
+								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 11:16:33 +00:00
+								GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    CUDA_CHECK(cudaStreamSynchronize(cuda_ctx->stream()));
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(backend);
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
+								}
-												ggml : introduce ggml_status (ggml/750)

* using enum as an exit code instead of macros

* update return type from enum to unsigned int

* indentation fix

* compound update
ggml_compute_exit_code -> ggml_status
changed ggml_status from a bit-field type to simple codes
ggml_status to string cast

* ggml_status to string cast

* GGML_CALL was removed

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

											
										
										
											2024-03-04 09:05:42 +00:00
+								GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    ggml_cuda_set_device(cuda_ctx->device);
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
 								    for (int i = 0; i < cgraph->n_nodes; i++) {
 								        ggml_tensor * node = cgraph->nodes[i];
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
-												sync : ggml (backend v2) (#3912)

* sync : ggml (backend v2) (wip)

* sync : migrate examples and llama.cpp to dynamic graphs (wip)

* sync : update tests + fix max op params to 64

ggml-ci

* sync : ggml-cuda

ggml-ci

* llama : fix save/load state context size

ggml-ci

* sync : try to fix build on tvOS

* sync : pass custom graph sizes in training examples

* sync : update graph copies to new ggml API

* sync : update sync-ggml.sh with new files

* scripts : fix header in sync script

* train : fix context size calculations

* llama : increase inference graph size up to 4096 nodes

* train : allocate grads for backward graphs

* train : allocate grads for gb_tmp
											
										
										
											2023-11-13 12:16:23 +00:00
+								            continue;
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								        }
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								#ifndef NDEBUG
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								        assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
+								        for (int j = 0; j < GGML_MAX_SRC; j++) {
 								            if (node->src[j] != nullptr) {
-												cuda : ignore peer access already enabled errors (#5597)

* cuda : ignore peer access already enabled errors

* fix hip
											
										
										
											2024-02-19 22:40:26 +00:00
+								                assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
+								            }
 								        }
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								#endif
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
+								        if (!ok) {
 								            fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
 								        }
 								        GGML_ASSERT(ok);
 								    }
-												ggml : introduce ggml_status (ggml/750)

* using enum as an exit code instead of macros

* update return type from enum to unsigned int

* indentation fix

* compound update
ggml_compute_exit_code -> ggml_status
changed ggml_status from a bit-field type to simple codes
ggml_status to string cast

* ggml_status to string cast

* GGML_CALL was removed

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

											
										
										
											2024-03-04 09:05:42 +00:00
+								    return GGML_STATUS_SUCCESS;
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
+								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 11:16:33 +00:00
+								GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								    switch (op->op) {
 								        case GGML_OP_UNARY:
 								            switch (ggml_get_unary_op(op)) {
 								                case GGML_UNARY_OP_GELU:
 								                case GGML_UNARY_OP_SILU:
 								                case GGML_UNARY_OP_RELU:
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 13:10:15 +00:00
+								                case GGML_UNARY_OP_HARDSIGMOID:
 								                case GGML_UNARY_OP_HARDSWISH:
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
+								                case GGML_UNARY_OP_GELU_QUICK:
 								                case GGML_UNARY_OP_TANH:
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								                    return true;
 								                default:
 								                    return false;
 								            }
 								            break;
 								        case GGML_OP_MUL_MAT:
 								        case GGML_OP_MUL_MAT_ID:
 								            {
 								                struct ggml_tensor * a;
 								                struct ggml_tensor * b;
 								                if (op->op == GGML_OP_MUL_MAT) {
 								                    a = op->src[0];
 								                    b = op->src[1];
 								                } else {
 								                    a = op->src[2];
 								                    b = op->src[1];
 								                }
 								                if (a->ne[3] != b->ne[3]) {
 								                    return false;
 								                }
-												ggml : add IQ2 to test-backend-ops + refactoring (#4990)

* ggml : add IQ2 to test-backend-ops + refactoring

ggml-ci

* cuda : update supports_op for IQ2

ggml-ci

* ci : enable LLAMA_CUBLAS=1 for CUDA nodes

ggml-ci

* cuda : fix out-of-bounds-access in `mul_mat_vec_q`

ggml-ci

* tests : avoid creating RNGs for each Q tensor

ggml-ci

* tests : avoid creating RNGs for each tensor

ggml-ci
											
										
										
											2024-01-17 16:54:56 +00:00
+								                ggml_type a_type = a->type;
-												IQ4_NL: 4-bit non-linear quants with blocks of 32 (#5590)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* iq4_nl: Fix after merging with master

* iq4_nl: another fix after merging with master

* Use IQ4_NL instead of Q4_K when using k-quants is not possible

* Fix typo that makes several tests fail

* It was the ggml_vdotq thing missed inside the brackets

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-21 09:39:52 +00:00
+								                if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
-												Adding IQ2_S and IQ2_M to complete coverage of the 2-3 bit quantization range (#5721)

* Adding IQ2_S and IQ2_M as a single cumulative commit

* Update examples/quantize/quantize.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-02-26 16:28:38 +00:00
+								                    a_type == GGML_TYPE_IQ1_S   || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S   ||
-												IQ4_XS: a 4.25 bpw quantization (#5747)

* Try IQ4_NL with blocks of 64 - does not look good

* iq4_xs: go to super-blocks of 256 and 6-bit scales for blocks of 32

* iq4_xs: CUDA works - 133.2 t/s

* iq4_xs: AVX2 dot product

* iq4_xs: ARM_NEON dot product

* iq4_nl: Metal implementation

As usual, Metal / Apple Silicon don't like my quants.

* iq3_xs: minor fix

* iq4_xs: shrink by using IQ3_S for attn_k and attn_q

* iq4_xs: revert using IQ3_S for attn_k and attn_v

PPL vs size is good, but CPU performance suffers: on M2 Max
TG-128 drops to 21.7 t/s from 28.8, and on a Ryzen-7950X
to 14.5 t/s from 15.8 t/s. On CUDA we have 135 t/s when
using IQ3_S vs 133 t/s with pure IQ4_XS.

* Fix CI

* iq4_xs: Added forgotten check for 256 divisibility

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-27 14:34:24 +00:00
+								                    a_type == GGML_TYPE_IQ2_S   || a_type == GGML_TYPE_IQ4_XS) {
-												ggml : add IQ2 to test-backend-ops + refactoring (#4990)

* ggml : add IQ2 to test-backend-ops + refactoring

ggml-ci

* cuda : update supports_op for IQ2

ggml-ci

* ci : enable LLAMA_CUBLAS=1 for CUDA nodes

ggml-ci

* cuda : fix out-of-bounds-access in `mul_mat_vec_q`

ggml-ci

* tests : avoid creating RNGs for each Q tensor

ggml-ci

* tests : avoid creating RNGs for each tensor

ggml-ci
											
										
										
											2024-01-17 16:54:56 +00:00
+								                    if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
 								                        return false;
 								                    }
 								                }
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								                return true;
 								            } break;
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
+								        case GGML_OP_GET_ROWS:
 								            {
 								                switch (op->src[0]->type) {
 								                    case GGML_TYPE_F16:
 								                    case GGML_TYPE_F32:
 								                    case GGML_TYPE_Q4_0:
 								                    case GGML_TYPE_Q4_1:
 								                    case GGML_TYPE_Q5_0:
 								                    case GGML_TYPE_Q5_1:
 								                    case GGML_TYPE_Q8_0:
 								                        return true;
 								                    default:
 								                        return false;
 								                }
 								            } break;
 								        case GGML_OP_CPY:
 								            {
 								                ggml_type src0_type = op->src[0]->type;
 								                ggml_type src1_type = op->src[1]->type;
 								                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
 								                    return true;
 								                }
 								                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
 								                    return true;
 								                }
 								                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
 								                    return true;
 								                }
 								                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
 								                    return true;
 								                }
 								                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) {
 								                    return true;
 								                }
-												Add ability to use Q5_0, Q5_1, and IQ4_NL for quantized K cache (#6183)

* k_cache: be able to use Q5_0

* k_cache: be able to use Q5_1 on CODA

* k_cache: be able to use Q5_0 on Metal

* k_cache: be able to use Q5_1 on Metal

* k_cache: be able to use IQ4_NL - just CUDA for now

* k_cache: be able to use IQ4_NL on Metal

* k_cache: add newly added supported types to llama-bench and CUDA supports_op

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-03-21 07:27:57 +00:00
+								                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q5_0) {
 								                    return true;
 								                }
 								                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q5_1) {
 								                    return true;
 								                }
 								                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) {
 								                    return true;
 								                }
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
+								                if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
 								                    return true;
 								                }
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 12:37:33 +00:00
+								                if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
 								                    return true;
 								                }
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
+								                return false;
 								            } break;
-												cuda : mark I16 and I32 ops as unsupported

ggml-ci

											
										
										
											2024-01-03 11:01:44 +00:00
+								        case GGML_OP_DUP:
 								        case GGML_OP_REPEAT:
 								        case GGML_OP_CONCAT:
 								            {
 								                ggml_type src0_type = op->src[0]->type;
-												cuda : simplify expression

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-03 12:18:46 +00:00
+								                return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
-												cuda : mark I16 and I32 ops as unsupported

ggml-ci

											
										
										
											2024-01-03 11:01:44 +00:00
+								            } break;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								        case GGML_OP_NONE:
 								        case GGML_OP_RESHAPE:
 								        case GGML_OP_VIEW:
 								        case GGML_OP_PERMUTE:
 								        case GGML_OP_TRANSPOSE:
 								        case GGML_OP_NORM:
 								        case GGML_OP_ADD:
 								        case GGML_OP_MUL:
 								        case GGML_OP_DIV:
 								        case GGML_OP_RMS_NORM:
 								        case GGML_OP_SCALE:
 								        case GGML_OP_SQR:
 								        case GGML_OP_CLAMP:
 								        case GGML_OP_CONT:
 								        case GGML_OP_DIAG_MASK_INF:
 								        case GGML_OP_SOFT_MAX:
 								        case GGML_OP_ROPE:
 								        case GGML_OP_ALIBI:
 								        case GGML_OP_IM2COL:
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 13:10:15 +00:00
+								        case GGML_OP_POOL_2D:
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								        case GGML_OP_SUM_ROWS:
 								        case GGML_OP_ARGSORT:
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
+								        case GGML_OP_ACC:
 								        case GGML_OP_GROUP_NORM:
 								        case GGML_OP_UPSCALE:
 								        case GGML_OP_PAD:
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 12:23:52 +00:00
+								        case GGML_OP_ARANGE:
 								        case GGML_OP_TIMESTEP_EMBEDDING:
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 19:54:54 +00:00
+								        case GGML_OP_LEAKY_RELU:
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								            return true;
 								        default:
 								            return false;
 								    }
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(backend);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								}
-												backend : offload large batches to GPU (#6083)

* backend : offload large batches to GPU

* fix hip

* code cleanup

* fix CUDA split buffers

* Update ggml-backend-impl.h

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix memset without set_device

* imatrix : remove sched affix from weight names

* sched : add a new split if the current one has too many inputs
reduce max inputs per split
more cleanup

* update backends

ggml-ci

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-03-18 10:03:04 +00:00
+								GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
 								    const int min_batch_size = 32;
 								    return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(backend);
-												backend : offload large batches to GPU (#6083)

* backend : offload large batches to GPU

* fix hip

* code cleanup

* fix CUDA split buffers

* Update ggml-backend-impl.h

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix memset without set_device

* imatrix : remove sched affix from weight names

* sched : add a new split if the current one has too many inputs
reduce max inputs per split
more cleanup

* update backends

ggml-ci

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-03-18 10:03:04 +00:00
+								}
-												llama : add pipeline parallelism support (#6017)

* llama : add pipeline parallelism support for batch processing with multiple CUDA GPUs

ggml-ci

* server : add -ub, --ubatch-size parameter

* fix server embedding test

* llama : fix Mamba inference for pipeline parallelism

Tested to work correctly with both `main` and `parallel` examples.

* llama : limit max batch size to n_batch

* add LLAMA_SCHED_MAX_COPIES to configure the number of input copies for pipeline parallelism
default increase to 4 (from 2)

changing this value may improve performance for some systems, but increases memory usage

* fix hip build

* fix sycl build (disable cpy_tensor_async)

* fix hip build

* llama : limit n_batch and n_ubatch to n_ctx during context creation

* llama : fix norm backend

* batched-bench : sync after decode

* swiftui : sync after decode

* ggml : allow ggml_get_rows to use multiple threads if they are available

* check n_ubatch >= n_tokens with non-casual attention

* llama : do not limit n_batch to n_ctx with non-casual attn

* server : construct batch with size of llama_n_batch

* ggml_backend_cpu_graph_compute : fix return value when alloc fails

* llama : better n_batch and n_ubatch comment

* fix merge

* small fix

* reduce default n_batch to 2048

---------

Co-authored-by: Francis Couture-Harpin <git@compilade.net>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-03-13 17:54:21 +00:00
+								static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) {
 								    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 								    ggml_cuda_set_device(cuda_ctx->device);
 								    cudaEvent_t event;
 								    CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
 								    return new ggml_backend_event {
 								        /* .backend = */ backend,
 								        /* .context = */ event,
 								    };
 								}
 								static void ggml_backend_cuda_event_free(ggml_backend_event_t event) {
 								    CUDA_CHECK(cudaEventDestroy((cudaEvent_t)event->context));
 								    delete event;
 								}
 								static void ggml_backend_cuda_event_record(ggml_backend_event_t event) {
 								    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)event->backend->context;
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, cuda_ctx->stream()));
-												llama : add pipeline parallelism support (#6017)

* llama : add pipeline parallelism support for batch processing with multiple CUDA GPUs

ggml-ci

* server : add -ub, --ubatch-size parameter

* fix server embedding test

* llama : fix Mamba inference for pipeline parallelism

Tested to work correctly with both `main` and `parallel` examples.

* llama : limit max batch size to n_batch

* add LLAMA_SCHED_MAX_COPIES to configure the number of input copies for pipeline parallelism
default increase to 4 (from 2)

changing this value may improve performance for some systems, but increases memory usage

* fix hip build

* fix sycl build (disable cpy_tensor_async)

* fix hip build

* llama : limit n_batch and n_ubatch to n_ctx during context creation

* llama : fix norm backend

* batched-bench : sync after decode

* swiftui : sync after decode

* ggml : allow ggml_get_rows to use multiple threads if they are available

* check n_ubatch >= n_tokens with non-casual attention

* llama : do not limit n_batch to n_ctx with non-casual attn

* server : construct batch with size of llama_n_batch

* ggml_backend_cpu_graph_compute : fix return value when alloc fails

* llama : better n_batch and n_ubatch comment

* fix merge

* small fix

* reduce default n_batch to 2048

---------

Co-authored-by: Francis Couture-Harpin <git@compilade.net>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-03-13 17:54:21 +00:00
+								}
 								static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
 								    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 								    if (ggml_backend_is_cuda(event->backend)) {
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), (cudaEvent_t)event->context, 0));
-												llama : add pipeline parallelism support (#6017)

* llama : add pipeline parallelism support for batch processing with multiple CUDA GPUs

ggml-ci

* server : add -ub, --ubatch-size parameter

* fix server embedding test

* llama : fix Mamba inference for pipeline parallelism

Tested to work correctly with both `main` and `parallel` examples.

* llama : limit max batch size to n_batch

* add LLAMA_SCHED_MAX_COPIES to configure the number of input copies for pipeline parallelism
default increase to 4 (from 2)

changing this value may improve performance for some systems, but increases memory usage

* fix hip build

* fix sycl build (disable cpy_tensor_async)

* fix hip build

* llama : limit n_batch and n_ubatch to n_ctx during context creation

* llama : fix norm backend

* batched-bench : sync after decode

* swiftui : sync after decode

* ggml : allow ggml_get_rows to use multiple threads if they are available

* check n_ubatch >= n_tokens with non-casual attention

* llama : do not limit n_batch to n_ctx with non-casual attn

* server : construct batch with size of llama_n_batch

* ggml_backend_cpu_graph_compute : fix return value when alloc fails

* llama : better n_batch and n_ubatch comment

* fix merge

* small fix

* reduce default n_batch to 2048

---------

Co-authored-by: Francis Couture-Harpin <git@compilade.net>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-03-13 17:54:21 +00:00
+								    } else {
-												cuda : disable unused cudaLaunchHostFunc code (#6078)


											
										
										
											2024-03-15 12:24:03 +00:00
+								#if 0
-												llama : add pipeline parallelism support (#6017)

* llama : add pipeline parallelism support for batch processing with multiple CUDA GPUs

ggml-ci

* server : add -ub, --ubatch-size parameter

* fix server embedding test

* llama : fix Mamba inference for pipeline parallelism

Tested to work correctly with both `main` and `parallel` examples.

* llama : limit max batch size to n_batch

* add LLAMA_SCHED_MAX_COPIES to configure the number of input copies for pipeline parallelism
default increase to 4 (from 2)

changing this value may improve performance for some systems, but increases memory usage

* fix hip build

* fix sycl build (disable cpy_tensor_async)

* fix hip build

* llama : limit n_batch and n_ubatch to n_ctx during context creation

* llama : fix norm backend

* batched-bench : sync after decode

* swiftui : sync after decode

* ggml : allow ggml_get_rows to use multiple threads if they are available

* check n_ubatch >= n_tokens with non-casual attention

* llama : do not limit n_batch to n_ctx with non-casual attn

* server : construct batch with size of llama_n_batch

* ggml_backend_cpu_graph_compute : fix return value when alloc fails

* llama : better n_batch and n_ubatch comment

* fix merge

* small fix

* reduce default n_batch to 2048

---------

Co-authored-by: Francis Couture-Harpin <git@compilade.net>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-03-13 17:54:21 +00:00
+								        // untested
 								        auto wait_fn = [](void * user_data) {
 								            ggml_backend_event_t event = (ggml_backend_event_t)user_data;
 								            ggml_backend_event_synchronize(event);
 								        };
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								        CUDA_CHECK(cudaLaunchHostFunc(cuda_ctx->stream(), wait_fn, event));
-												cuda : disable unused cudaLaunchHostFunc code (#6078)


											
										
										
											2024-03-15 12:24:03 +00:00
+								#endif
 								        GGML_ASSERT(false);
-												llama : add pipeline parallelism support (#6017)

* llama : add pipeline parallelism support for batch processing with multiple CUDA GPUs

ggml-ci

* server : add -ub, --ubatch-size parameter

* fix server embedding test

* llama : fix Mamba inference for pipeline parallelism

Tested to work correctly with both `main` and `parallel` examples.

* llama : limit max batch size to n_batch

* add LLAMA_SCHED_MAX_COPIES to configure the number of input copies for pipeline parallelism
default increase to 4 (from 2)

changing this value may improve performance for some systems, but increases memory usage

* fix hip build

* fix sycl build (disable cpy_tensor_async)

* fix hip build

* llama : limit n_batch and n_ubatch to n_ctx during context creation

* llama : fix norm backend

* batched-bench : sync after decode

* swiftui : sync after decode

* ggml : allow ggml_get_rows to use multiple threads if they are available

* check n_ubatch >= n_tokens with non-casual attention

* llama : do not limit n_batch to n_ctx with non-casual attn

* server : construct batch with size of llama_n_batch

* ggml_backend_cpu_graph_compute : fix return value when alloc fails

* llama : better n_batch and n_ubatch comment

* fix merge

* small fix

* reduce default n_batch to 2048

---------

Co-authored-by: Francis Couture-Harpin <git@compilade.net>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-03-13 17:54:21 +00:00
+								    }
 								}
 								static void ggml_backend_cuda_event_synchronize(ggml_backend_event_t event) {
 								    CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
 								}
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								static ggml_backend_i ggml_backend_cuda_interface = {
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								    /* .get_name                = */ ggml_backend_cuda_name,
 								    /* .free                    = */ ggml_backend_cuda_free,
 								    /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
 								    /* .set_tensor_async        = */ ggml_backend_cuda_set_tensor_async,
 								    /* .get_tensor_async        = */ ggml_backend_cuda_get_tensor_async,
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								    /* .cpy_tensor_async        = */ ggml_backend_cuda_cpy_tensor_async,
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								    /* .synchronize             = */ ggml_backend_cuda_synchronize,
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								    /* .graph_plan_create       = */ NULL,
 								    /* .graph_plan_free         = */ NULL,
 								    /* .graph_plan_compute      = */ NULL,
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								    /* .graph_compute           = */ ggml_backend_cuda_graph_compute,
 								    /* .supports_op             = */ ggml_backend_cuda_supports_op,
-												backend : offload large batches to GPU (#6083)

* backend : offload large batches to GPU

* fix hip

* code cleanup

* fix CUDA split buffers

* Update ggml-backend-impl.h

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix memset without set_device

* imatrix : remove sched affix from weight names

* sched : add a new split if the current one has too many inputs
reduce max inputs per split
more cleanup

* update backends

ggml-ci

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-03-18 10:03:04 +00:00
+								    /* .offload_op              = */ ggml_backend_cuda_offload_op,
-												llama : add pipeline parallelism support (#6017)

* llama : add pipeline parallelism support for batch processing with multiple CUDA GPUs

ggml-ci

* server : add -ub, --ubatch-size parameter

* fix server embedding test

* llama : fix Mamba inference for pipeline parallelism

Tested to work correctly with both `main` and `parallel` examples.

* llama : limit max batch size to n_batch

* add LLAMA_SCHED_MAX_COPIES to configure the number of input copies for pipeline parallelism
default increase to 4 (from 2)

changing this value may improve performance for some systems, but increases memory usage

* fix hip build

* fix sycl build (disable cpy_tensor_async)

* fix hip build

* llama : limit n_batch and n_ubatch to n_ctx during context creation

* llama : fix norm backend

* batched-bench : sync after decode

* swiftui : sync after decode

* ggml : allow ggml_get_rows to use multiple threads if they are available

* check n_ubatch >= n_tokens with non-casual attention

* llama : do not limit n_batch to n_ctx with non-casual attn

* server : construct batch with size of llama_n_batch

* ggml_backend_cpu_graph_compute : fix return value when alloc fails

* llama : better n_batch and n_ubatch comment

* fix merge

* small fix

* reduce default n_batch to 2048

---------

Co-authored-by: Francis Couture-Harpin <git@compilade.net>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-03-13 17:54:21 +00:00
+								    /* .event_new               = */ ggml_backend_cuda_event_new,
 								    /* .event_free              = */ ggml_backend_cuda_event_free,
 								    /* .event_record            = */ ggml_backend_cuda_event_record,
 								    /* .event_wait              = */ ggml_backend_cuda_event_wait,
 								    /* .event_synchronize       = */ ggml_backend_cuda_event_synchronize,
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
+								};
-												Introduce backend GUIDs (ggml/743)

* Introduce backend GUIDs

Initial proposed implementation of backend GUIDs
(Discussed in https://github.com/ggerganov/ggml/pull/741)

Hardcoded CPU backend GUID (for now)
Change ggml_backend_is_cpu logic to use GUID

* Remove redundant functions

Remove redundant functions `ggml_backend_i::get_name` and `ggml_backend_guid` which are not desired for future expansion

* Add spaces to match style

Co-authored-by: slaren <slarengh@gmail.com>

* Fix brace style to match

Co-authored-by: slaren <slarengh@gmail.com>

* Add void to () in function signature

Co-authored-by: slaren <slarengh@gmail.com>

* Add back ggml_backend_guid and make CPU_GUID a local static in ggml_backend_cpu_guid

* add guids to all backends

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-02-24 16:27:36 +00:00
+								static ggml_guid_t ggml_backend_cuda_guid() {
 								    static ggml_guid guid = { 0x2c, 0xdd, 0xe8, 0x1c, 0x65, 0xb3, 0x65, 0x73, 0x6a, 0x12, 0x88, 0x61, 0x1c, 0xc9, 0xdc, 0x25 };
 								    return &guid;
 								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 11:16:33 +00:00
+								GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								        fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
 								        return nullptr;
 								    }
-												llama : add pipeline parallelism support (#6017)

* llama : add pipeline parallelism support for batch processing with multiple CUDA GPUs

ggml-ci

* server : add -ub, --ubatch-size parameter

* fix server embedding test

* llama : fix Mamba inference for pipeline parallelism

Tested to work correctly with both `main` and `parallel` examples.

* llama : limit max batch size to n_batch

* add LLAMA_SCHED_MAX_COPIES to configure the number of input copies for pipeline parallelism
default increase to 4 (from 2)

changing this value may improve performance for some systems, but increases memory usage

* fix hip build

* fix sycl build (disable cpy_tensor_async)

* fix hip build

* llama : limit n_batch and n_ubatch to n_ctx during context creation

* llama : fix norm backend

* batched-bench : sync after decode

* swiftui : sync after decode

* ggml : allow ggml_get_rows to use multiple threads if they are available

* check n_ubatch >= n_tokens with non-casual attention

* llama : do not limit n_batch to n_ctx with non-casual attn

* server : construct batch with size of llama_n_batch

* ggml_backend_cpu_graph_compute : fix return value when alloc fails

* llama : better n_batch and n_ubatch comment

* fix merge

* small fix

* reduce default n_batch to 2048

---------

Co-authored-by: Francis Couture-Harpin <git@compilade.net>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-03-13 17:54:21 +00:00
+								    ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
 								    if (ctx == nullptr) {
 								        fprintf(stderr, "%s: error: failed to allocate context\n", __func__);
 								        return nullptr;
 								    }
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
 								    ggml_backend_t cuda_backend = new ggml_backend {
-												Introduce backend GUIDs (ggml/743)

* Introduce backend GUIDs

Initial proposed implementation of backend GUIDs
(Discussed in https://github.com/ggerganov/ggml/pull/741)

Hardcoded CPU backend GUID (for now)
Change ggml_backend_is_cpu logic to use GUID

* Remove redundant functions

Remove redundant functions `ggml_backend_i::get_name` and `ggml_backend_guid` which are not desired for future expansion

* Add spaces to match style

Co-authored-by: slaren <slarengh@gmail.com>

* Fix brace style to match

Co-authored-by: slaren <slarengh@gmail.com>

* Add void to () in function signature

Co-authored-by: slaren <slarengh@gmail.com>

* Add back ggml_backend_guid and make CPU_GUID a local static in ggml_backend_cpu_guid

* add guids to all backends

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-02-24 16:27:36 +00:00
+								        /* .guid      = */ ggml_backend_cuda_guid(),
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								        /* .interface = */ ggml_backend_cuda_interface,
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 17:19:14 +00:00
+								        /* .context   = */ ctx
 								    };
 								    return cuda_backend;
 								}
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 11:16:33 +00:00
+								GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend) {
-												Introduce backend GUIDs (ggml/743)

* Introduce backend GUIDs

Initial proposed implementation of backend GUIDs
(Discussed in https://github.com/ggerganov/ggml/pull/741)

Hardcoded CPU backend GUID (for now)
Change ggml_backend_is_cpu logic to use GUID

* Remove redundant functions

Remove redundant functions `ggml_backend_i::get_name` and `ggml_backend_guid` which are not desired for future expansion

* Add spaces to match style

Co-authored-by: slaren <slarengh@gmail.com>

* Fix brace style to match

Co-authored-by: slaren <slarengh@gmail.com>

* Add void to () in function signature

Co-authored-by: slaren <slarengh@gmail.com>

* Add back ggml_backend_guid and make CPU_GUID a local static in ggml_backend_cpu_guid

* add guids to all backends

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-02-24 16:27:36 +00:00
+								    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cuda_guid());
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 11:16:33 +00:00
+								GGML_CALL int ggml_backend_cuda_get_device_count() {
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    return get_cuda_global_info().device_count;
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 11:16:33 +00:00
+								GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    cudaDeviceProp prop;
 								    CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
 								    snprintf(description, description_size, "%s", prop.name);
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 11:16:33 +00:00
+								GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								    ggml_cuda_set_device(device);
 								    CUDA_CHECK(cudaMemGetInfo(free, total));
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								}
-												backend : offload large batches to GPU (#6083)

* backend : offload large batches to GPU

* fix hip

* code cleanup

* fix CUDA split buffers

* Update ggml-backend-impl.h

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix memset without set_device

* imatrix : remove sched affix from weight names

* sched : add a new split if the current one has too many inputs
reduce max inputs per split
more cleanup

* update backends

ggml-ci

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-03-18 10:03:04 +00:00
+								GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
-												cuda : disable host register by default (#6206)


											
										
										
											2024-03-21 18:54:28 +00:00
+								    if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
-												backend : offload large batches to GPU (#6083)

* backend : offload large batches to GPU

* fix hip

* code cleanup

* fix CUDA split buffers

* Update ggml-backend-impl.h

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix memset without set_device

* imatrix : remove sched affix from weight names

* sched : add a new split if the current one has too many inputs
reduce max inputs per split
more cleanup

* update backends

ggml-ci

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-03-18 10:03:04 +00:00
+								        return false;
 								    }
 								    cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
 								    if (err != cudaSuccess) {
 								        // clear the error
 								        cudaGetLastError();
 								        fprintf(stderr, "%s: warning: failed to register %.2f MiB of pinned memory: %s\n", __func__,
 								                size/1024.0/1024.0, cudaGetErrorString(err));
 								        return false;
 								    }
 								    return true;
 								}
 								GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
-												cuda : disable host register by default (#6206)


											
										
										
											2024-03-21 18:54:28 +00:00
+								    if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
 								        return;
 								    }
-												backend : offload large batches to GPU (#6083)

* backend : offload large batches to GPU

* fix hip

* code cleanup

* fix CUDA split buffers

* Update ggml-backend-impl.h

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix memset without set_device

* imatrix : remove sched affix from weight names

* sched : add a new split if the current one has too many inputs
reduce max inputs per split
more cleanup

* update backends

ggml-ci

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-03-18 10:03:04 +00:00
+								    cudaError_t err = cudaHostUnregister(buffer);
 								    if (err != cudaSuccess) {
 								        // clear the error
 								        cudaGetLastError();
 								    }
 								}
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 19:07:38 +00:00
+								// backend registry
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 11:16:33 +00:00
+								GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								    ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
 								    return cuda_backend;
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    GGML_UNUSED(params);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 11:16:33 +00:00
+								extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 12:04:25 +00:00
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 11:16:33 +00:00
+								GGML_CALL int ggml_backend_cuda_reg_devices() {
-												cuda : refactor to remove global resources (#6170)

* cuda : refactor to remove global resources

											
										
										
											2024-03-20 13:42:59 +00:00
+								    int device_count = ggml_backend_cuda_get_device_count();
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 20:26:54 +00:00
+								    //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
 								    for (int i = 0; i < device_count; i++) {
 								        char name[128];
 								        snprintf(name, sizeof(name), "%s%d", GGML_CUDA_NAME, i);
 								        ggml_backend_register(name, ggml_backend_reg_cuda_init, ggml_backend_cuda_buffer_type(i), (void *) (intptr_t) i);
 								    }
 								    return device_count;
 								}