2024-02-22 21:21:39 +00:00
|
|
|
#include "ggml-cuda.h"
|
|
|
|
#include "ggml.h"
|
|
|
|
#include "ggml-backend-impl.h"
|
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
#include "ggml-cuda/common.cuh"
|
|
|
|
#include "ggml-cuda/acc.cuh"
|
|
|
|
#include "ggml-cuda/alibi.cuh"
|
|
|
|
#include "ggml-cuda/arange.cuh"
|
|
|
|
#include "ggml-cuda/argsort.cuh"
|
|
|
|
#include "ggml-cuda/binbcast.cuh"
|
|
|
|
#include "ggml-cuda/clamp.cuh"
|
|
|
|
#include "ggml-cuda/concat.cuh"
|
|
|
|
#include "ggml-cuda/convert.cuh"
|
|
|
|
#include "ggml-cuda/cpy.cuh"
|
|
|
|
#include "ggml-cuda/diagmask.cuh"
|
|
|
|
#include "ggml-cuda/dmmv.cuh"
|
|
|
|
#include "ggml-cuda/getrows.cuh"
|
|
|
|
#include "ggml-cuda/im2col.cuh"
|
|
|
|
#include "ggml-cuda/mmq.cuh"
|
|
|
|
#include "ggml-cuda/mmvq.cuh"
|
|
|
|
#include "ggml-cuda/norm.cuh"
|
|
|
|
#include "ggml-cuda/pad.cuh"
|
|
|
|
#include "ggml-cuda/pool2d.cuh"
|
|
|
|
#include "ggml-cuda/quantize.cuh"
|
|
|
|
#include "ggml-cuda/rope.cuh"
|
|
|
|
#include "ggml-cuda/scale.cuh"
|
|
|
|
#include "ggml-cuda/softmax.cuh"
|
|
|
|
#include "ggml-cuda/sumrows.cuh"
|
|
|
|
#include "ggml-cuda/tsembd.cuh"
|
|
|
|
#include "ggml-cuda/unary.cuh"
|
|
|
|
#include "ggml-cuda/upscale.cuh"
|
2024-03-09 10:47:57 +00:00
|
|
|
|
2023-09-28 19:42:38 +00:00
|
|
|
#include <algorithm>
|
2024-03-20 13:42:59 +00:00
|
|
|
#include <array>
|
2023-12-13 12:04:25 +00:00
|
|
|
#include <atomic>
|
|
|
|
#include <cinttypes>
|
2023-05-01 16:11:07 +00:00
|
|
|
#include <cstddef>
|
|
|
|
#include <cstdint>
|
2023-12-07 20:26:54 +00:00
|
|
|
#include <float.h>
|
2023-06-14 17:47:19 +00:00
|
|
|
#include <limits>
|
2024-03-20 13:42:59 +00:00
|
|
|
#include <map>
|
|
|
|
#include <memory>
|
|
|
|
#include <mutex>
|
2023-04-20 01:14:14 +00:00
|
|
|
#include <stdint.h>
|
2023-04-21 19:59:17 +00:00
|
|
|
#include <stdio.h>
|
2024-01-12 19:07:38 +00:00
|
|
|
#include <string>
|
2023-12-13 12:04:25 +00:00
|
|
|
#include <vector>
|
2023-04-20 01:14:14 +00:00
|
|
|
|
2023-05-01 16:11:07 +00:00
|
|
|
static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
|
|
|
|
2023-12-26 20:23:59 +00:00
|
|
|
[[noreturn]]
|
2024-03-25 12:50:23 +00:00
|
|
|
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
|
2023-12-26 20:23:59 +00:00
|
|
|
int id = -1; // in case cudaGetDevice fails
|
|
|
|
cudaGetDevice(&id);
|
|
|
|
|
|
|
|
fprintf(stderr, "CUDA error: %s\n", msg);
|
|
|
|
fprintf(stderr, " current device: %d, in function %s at %s:%d\n", id, func, file, line);
|
|
|
|
fprintf(stderr, " %s\n", stmt);
|
|
|
|
// abort with GGML_ASSERT to get a stack trace
|
|
|
|
GGML_ASSERT(!"CUDA error");
|
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
// this is faster on Windows
|
|
|
|
// probably because the Windows CUDA libraries forget to make this check before invoking the drivers
|
2024-03-25 12:50:23 +00:00
|
|
|
void ggml_cuda_set_device(int device) {
|
2024-03-20 13:42:59 +00:00
|
|
|
int current_device;
|
|
|
|
CUDA_CHECK(cudaGetDevice(¤t_device));
|
2023-07-29 21:04:44 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
if (device == current_device) {
|
|
|
|
return;
|
|
|
|
}
|
2023-07-29 21:04:44 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
CUDA_CHECK(cudaSetDevice(device));
|
2023-07-29 21:04:44 +00:00
|
|
|
}
|
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
int ggml_cuda_get_device() {
|
2024-03-20 13:42:59 +00:00
|
|
|
int id;
|
|
|
|
CUDA_CHECK(cudaGetDevice(&id));
|
|
|
|
return id;
|
2023-07-29 21:04:44 +00:00
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
static ggml_cuda_device_info ggml_cuda_init() {
|
|
|
|
#ifdef __HIP_PLATFORM_AMD__
|
|
|
|
// Workaround for a rocBLAS bug when using multiple graphics cards:
|
|
|
|
// https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
|
|
|
|
rocblas_initialize();
|
|
|
|
CUDA_CHECK(cudaDeviceSynchronize());
|
|
|
|
#endif
|
2023-05-25 21:07:29 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
ggml_cuda_device_info info = {};
|
2023-05-25 21:07:29 +00:00
|
|
|
|
2024-03-20 20:03:26 +00:00
|
|
|
cudaError_t err = cudaGetDeviceCount(&info.device_count);
|
|
|
|
if (err != cudaSuccess) {
|
|
|
|
fprintf(stderr, "%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
|
2024-03-20 13:42:59 +00:00
|
|
|
return info;
|
|
|
|
}
|
2024-01-12 19:38:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES);
|
2023-05-13 13:38:36 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
int64_t total_vram = 0;
|
|
|
|
#if defined(GGML_CUDA_FORCE_MMQ)
|
|
|
|
fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
|
2023-06-16 17:08:44 +00:00
|
|
|
#else
|
2024-03-20 13:42:59 +00:00
|
|
|
fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
|
|
|
|
#endif
|
|
|
|
#if defined(CUDA_USE_TENSOR_CORES)
|
|
|
|
fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
|
|
|
|
#else
|
|
|
|
fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
|
2023-06-16 17:08:44 +00:00
|
|
|
#endif
|
2024-03-20 13:42:59 +00:00
|
|
|
fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
|
|
|
|
for (int id = 0; id < info.device_count; ++id) {
|
|
|
|
int device_vmm = 0;
|
2023-06-16 17:08:44 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
#if !defined(GGML_USE_HIPBLAS)
|
|
|
|
CUdevice device;
|
|
|
|
CU_CHECK(cuDeviceGet(&device, id));
|
|
|
|
CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
|
2023-09-17 14:37:53 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
if (device_vmm) {
|
|
|
|
CUmemAllocationProp alloc_prop = {};
|
|
|
|
alloc_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
|
|
|
|
alloc_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
|
|
|
alloc_prop.location.id = id;
|
|
|
|
CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
|
|
|
|
}
|
|
|
|
#endif // !defined(GGML_USE_HIPBLAS)
|
|
|
|
info.devices[id].vmm = !!device_vmm;
|
2023-09-11 17:55:51 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
cudaDeviceProp prop;
|
|
|
|
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
|
|
|
|
fprintf(stderr, " Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
|
2023-09-11 17:55:51 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
info.default_tensor_split[id] = total_vram;
|
|
|
|
total_vram += prop.totalGlobalMem;
|
2023-07-01 19:49:44 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
|
|
|
info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
|
|
|
|
#else
|
|
|
|
info.devices[id].cc = 100*prop.major + 10*prop.minor;
|
|
|
|
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
|
|
|
info.devices[id].smpb = prop.sharedMemPerBlock;
|
|
|
|
}
|
2023-09-11 17:55:51 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
for (int id = 0; id < info.device_count; ++id) {
|
|
|
|
info.default_tensor_split[id] /= total_vram;
|
2023-09-11 17:55:51 +00:00
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
// configure logging to stdout
|
|
|
|
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
|
|
|
|
|
|
|
|
return info;
|
2023-09-11 17:55:51 +00:00
|
|
|
}
|
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
const ggml_cuda_device_info & ggml_cuda_info() {
|
2024-03-20 13:42:59 +00:00
|
|
|
static ggml_cuda_device_info info = ggml_cuda_init();
|
|
|
|
return info;
|
|
|
|
}
|
2023-08-09 07:42:34 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
// #define DEBUG_CUDA_MALLOC
|
2023-12-24 13:34:22 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
// buffer pool for cuda (legacy)
|
|
|
|
struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
|
|
|
static const int MAX_BUFFERS = 256;
|
2023-12-21 17:02:30 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
int device;
|
|
|
|
struct ggml_cuda_buffer {
|
|
|
|
void * ptr = nullptr;
|
|
|
|
size_t size = 0;
|
|
|
|
};
|
2023-12-21 17:02:30 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
ggml_cuda_buffer buffer_pool[MAX_BUFFERS] = {};
|
|
|
|
size_t pool_size = 0;
|
2024-01-23 12:31:56 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
explicit ggml_cuda_pool_leg(int device) :
|
|
|
|
device(device) {
|
2023-12-01 08:51:24 +00:00
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
~ggml_cuda_pool_leg() {
|
|
|
|
ggml_cuda_set_device(device);
|
|
|
|
for (int i = 0; i < MAX_BUFFERS; ++i) {
|
|
|
|
ggml_cuda_buffer & b = buffer_pool[i];
|
|
|
|
if (b.ptr != nullptr) {
|
|
|
|
CUDA_CHECK(cudaFree(b.ptr));
|
|
|
|
pool_size -= b.size;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
GGML_ASSERT(pool_size == 0);
|
2023-12-01 08:51:24 +00:00
|
|
|
}
|
2024-01-09 07:58:55 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
void * alloc(size_t size, size_t * actual_size) override {
|
|
|
|
#ifdef DEBUG_CUDA_MALLOC
|
|
|
|
int nnz = 0;
|
|
|
|
size_t max_size = 0;
|
|
|
|
#endif
|
|
|
|
size_t best_diff = 1ull << 36;
|
|
|
|
int ibest = -1;
|
|
|
|
for (int i = 0; i < MAX_BUFFERS; ++i) {
|
|
|
|
ggml_cuda_buffer& b = buffer_pool[i];
|
|
|
|
if (b.ptr != nullptr) {
|
|
|
|
#ifdef DEBUG_CUDA_MALLOC
|
|
|
|
++nnz;
|
|
|
|
if (b.size > max_size) max_size = b.size;
|
|
|
|
#endif
|
|
|
|
if (b.size >= size) {
|
|
|
|
size_t diff = b.size - size;
|
|
|
|
if (diff < best_diff) {
|
|
|
|
best_diff = diff;
|
|
|
|
ibest = i;
|
|
|
|
if (!best_diff) {
|
|
|
|
void * ptr = b.ptr;
|
|
|
|
*actual_size = b.size;
|
|
|
|
b.ptr = nullptr;
|
|
|
|
b.size = 0;
|
|
|
|
return ptr;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (ibest >= 0) {
|
|
|
|
ggml_cuda_buffer& b = buffer_pool[ibest];
|
|
|
|
void * ptr = b.ptr;
|
|
|
|
*actual_size = b.size;
|
|
|
|
b.ptr = nullptr;
|
|
|
|
b.size = 0;
|
|
|
|
return ptr;
|
|
|
|
}
|
|
|
|
void * ptr;
|
|
|
|
size_t look_ahead_size = (size_t) (1.05 * size);
|
|
|
|
look_ahead_size = 256 * ((look_ahead_size + 255)/256);
|
|
|
|
ggml_cuda_set_device(device);
|
|
|
|
CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
|
|
|
|
*actual_size = look_ahead_size;
|
|
|
|
pool_size += look_ahead_size;
|
|
|
|
#ifdef DEBUG_CUDA_MALLOC
|
|
|
|
fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
|
|
|
|
(uint32_t)(max_size/1024/1024), (uint32_t)(pool_size/1024/1024), (uint32_t)(size/1024/1024));
|
|
|
|
#endif
|
|
|
|
return ptr;
|
2023-12-01 08:51:24 +00:00
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
void free(void * ptr, size_t size) override {
|
|
|
|
for (int i = 0; i < MAX_BUFFERS; ++i) {
|
|
|
|
ggml_cuda_buffer& b = buffer_pool[i];
|
|
|
|
if (b.ptr == nullptr) {
|
|
|
|
b.ptr = ptr;
|
|
|
|
b.size = size;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
|
|
|
|
ggml_cuda_set_device(device);
|
|
|
|
CUDA_CHECK(cudaFree(ptr));
|
|
|
|
pool_size -= size;
|
|
|
|
}
|
|
|
|
};
|
2024-01-09 07:58:55 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
// pool with virtual memory
|
|
|
|
#if !defined(GGML_USE_HIPBLAS)
|
|
|
|
struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
|
|
|
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
|
2023-06-06 19:33:23 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
int device;
|
|
|
|
CUdeviceptr pool_addr = 0;
|
|
|
|
size_t pool_used = 0;
|
|
|
|
size_t pool_size = 0;
|
|
|
|
size_t granularity;
|
2023-06-06 19:33:23 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
explicit ggml_cuda_pool_vmm(int device) :
|
|
|
|
device(device),
|
2024-03-25 12:50:23 +00:00
|
|
|
granularity(ggml_cuda_info().devices[device].vmm_granularity) {
|
2024-03-20 13:42:59 +00:00
|
|
|
}
|
2023-06-28 16:35:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
~ggml_cuda_pool_vmm() {
|
|
|
|
if (pool_addr != 0) {
|
|
|
|
CU_CHECK(cuMemUnmap(pool_addr, pool_size));
|
|
|
|
CU_CHECK(cuMemAddressFree(pool_addr, CUDA_POOL_VMM_MAX_SIZE));
|
|
|
|
}
|
|
|
|
}
|
2023-06-28 16:35:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
void * alloc(size_t size, size_t * actual_size) override {
|
|
|
|
// round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
|
|
|
|
const size_t alignment = 128;
|
|
|
|
size = alignment * ((size + alignment - 1) / alignment);
|
2023-11-01 11:49:04 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
size_t avail = pool_size - pool_used;
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
if (size > avail) {
|
|
|
|
// round up to the next multiple of the granularity
|
|
|
|
size_t reserve_size = size - avail;
|
|
|
|
reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_ASSERT(pool_size + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
// allocate more physical memory
|
|
|
|
CUmemAllocationProp prop = {};
|
|
|
|
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
|
|
|
|
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
|
|
|
prop.location.id = device;
|
|
|
|
CUmemGenericAllocationHandle handle;
|
|
|
|
CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
// reserve virtual address space (if not already reserved)
|
|
|
|
if (pool_addr == 0) {
|
|
|
|
CU_CHECK(cuMemAddressReserve(&pool_addr, CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
|
|
|
|
}
|
2023-11-01 11:49:04 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
// map at the end of the pool
|
|
|
|
CU_CHECK(cuMemMap(pool_addr + pool_size, reserve_size, 0, handle, 0));
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
// the memory allocation handle is no longer needed after mapping
|
|
|
|
CU_CHECK(cuMemRelease(handle));
|
cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)
* Broadcasting for ggml_mul
* CUDA kernel for ggml_mul, norms in VRAM
* GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* define default model path once, sync path with readme (#1366)
* ~7% faster Q5_1 AVX2 code (#1477)
* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)
* Support models in a single pytorch_model.bin
* Remove spurious line with typo
* benchmark-matmul: Print the average of the test results (#1490)
* Remove unused n_parts parameter (#1509)
* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)
* Fix for w64devkit and mingw
* make kv_f16 the default for api users (#1517)
* minor : fix compile warnings
* readme : adds WizardLM to the list of supported models (#1485)
* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)
* Make reverse prompt option act as a stop token in non-interactive scenarios
* Making requested review changes
* Update gpt_params_parse and fix a merge error
* Revert "Update gpt_params_parse and fix a merge error"
This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.
* Update gpt_params_parse and fix a merge error take 2
* examples : add persistent chat (#1495)
* examples : add persistent chat
* examples : fix whitespace
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* tests : add missing header
* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)
* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0
* llama : bump LLAMA_FILE_VERSION to 3
* cuda : update Q4 and Q8 dequantize kernels
* ggml : fix AVX dot products
* readme : update performance table + hot topics
* ggml : fix scalar implementation of Q4_1 dot
* llama : fix compile warnings in llama_set_state_data()
* llama : fix name shadowing and C4146 (#1526)
* Fix name shadowing and C4146
* Fix if macros not using defined when required
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Code style
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Fix for mingw (#1462)
* llama : add llama_init_backend() API (close #1527)
* feature : add blis and other BLAS implementation support (#1502)
* feature: add blis support
* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927
* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake
* Fix typo in INTEGER
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Revert "feature : add blis and other BLAS implementation support (#1502)"
This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.
* GPU weights not in RAM, direct loading with cuFile
* llama : code style fixes + progress print fix
* ggml : ggml_mul better broadcast support
* cmake : workarounds for cufile when CMake version < 3.25
* gg rebase fixup
* Loop in llama.cpp, fixed progress callback
* Attempt clang-tidy fix
* llama : fix vram size computation
* Add forgotten fclose()
---------
Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
2023-05-20 12:19:28 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
// set access
|
|
|
|
CUmemAccessDesc access = {};
|
|
|
|
access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
|
|
|
access.location.id = device;
|
|
|
|
access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
|
|
|
|
CU_CHECK(cuMemSetAccess(pool_addr + pool_size, reserve_size, &access, 1));
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
// add to the pool
|
|
|
|
pool_size += reserve_size;
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
//printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
|
2024-03-25 12:50:23 +00:00
|
|
|
// device, (unsigned long long) (pool_size/1024/1024),
|
2024-03-20 13:42:59 +00:00
|
|
|
// (unsigned long long) (reserve_size/1024/1024));
|
|
|
|
}
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_ASSERT(pool_addr != 0);
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
void * ptr = (void *) (pool_addr + pool_used);
|
|
|
|
*actual_size = size;
|
|
|
|
pool_used += size;
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
#ifdef DEBUG_CUDA_MALLOC
|
|
|
|
printf("cuda pool[%d]: allocated %llu bytes at %llx\n", device, (unsigned long long) size, ptr);
|
|
|
|
#endif
|
cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)
* Broadcasting for ggml_mul
* CUDA kernel for ggml_mul, norms in VRAM
* GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* define default model path once, sync path with readme (#1366)
* ~7% faster Q5_1 AVX2 code (#1477)
* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)
* Support models in a single pytorch_model.bin
* Remove spurious line with typo
* benchmark-matmul: Print the average of the test results (#1490)
* Remove unused n_parts parameter (#1509)
* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)
* Fix for w64devkit and mingw
* make kv_f16 the default for api users (#1517)
* minor : fix compile warnings
* readme : adds WizardLM to the list of supported models (#1485)
* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)
* Make reverse prompt option act as a stop token in non-interactive scenarios
* Making requested review changes
* Update gpt_params_parse and fix a merge error
* Revert "Update gpt_params_parse and fix a merge error"
This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.
* Update gpt_params_parse and fix a merge error take 2
* examples : add persistent chat (#1495)
* examples : add persistent chat
* examples : fix whitespace
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* tests : add missing header
* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)
* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0
* llama : bump LLAMA_FILE_VERSION to 3
* cuda : update Q4 and Q8 dequantize kernels
* ggml : fix AVX dot products
* readme : update performance table + hot topics
* ggml : fix scalar implementation of Q4_1 dot
* llama : fix compile warnings in llama_set_state_data()
* llama : fix name shadowing and C4146 (#1526)
* Fix name shadowing and C4146
* Fix if macros not using defined when required
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Code style
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Fix for mingw (#1462)
* llama : add llama_init_backend() API (close #1527)
* feature : add blis and other BLAS implementation support (#1502)
* feature: add blis support
* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927
* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake
* Fix typo in INTEGER
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Revert "feature : add blis and other BLAS implementation support (#1502)"
This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.
* GPU weights not in RAM, direct loading with cuFile
* llama : code style fixes + progress print fix
* ggml : ggml_mul better broadcast support
* cmake : workarounds for cufile when CMake version < 3.25
* gg rebase fixup
* Loop in llama.cpp, fixed progress callback
* Attempt clang-tidy fix
* llama : fix vram size computation
* Add forgotten fclose()
---------
Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
2023-05-20 12:19:28 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
return ptr;
|
2023-12-13 19:54:54 +00:00
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
void free(void * ptr, size_t size) override {
|
|
|
|
#ifdef DEBUG_CUDA_MALLOC
|
|
|
|
printf("cuda pool[%d]: freed %llu bytes at %llx\n", device, (unsigned long long) size, ptr);
|
|
|
|
#endif
|
2023-07-12 17:26:18 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
pool_used -= size;
|
|
|
|
|
|
|
|
// all deallocations must be in reverse order of the allocations
|
|
|
|
GGML_ASSERT(ptr == (void *) (pool_addr + pool_used));
|
2023-07-12 17:26:18 +00:00
|
|
|
}
|
2024-03-20 13:42:59 +00:00
|
|
|
};
|
|
|
|
#endif // !defined(GGML_USE_HIPBLAS)
|
2023-07-12 17:26:18 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
|
2024-03-20 13:42:59 +00:00
|
|
|
#if !defined(GGML_USE_HIPBLAS)
|
2024-03-25 12:50:23 +00:00
|
|
|
if (ggml_cuda_info().devices[device].vmm) {
|
|
|
|
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
|
2023-07-11 19:53:34 +00:00
|
|
|
}
|
2024-03-25 12:50:23 +00:00
|
|
|
#endif
|
|
|
|
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
|
|
|
|
}
|
2023-07-11 19:53:34 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
// cuda buffer
|
|
|
|
|
|
|
|
struct ggml_backend_cuda_buffer_context {
|
|
|
|
int device;
|
|
|
|
void * dev_ptr = nullptr;
|
|
|
|
std::string name;
|
|
|
|
|
|
|
|
ggml_backend_cuda_buffer_context(int device, void * dev_ptr) :
|
|
|
|
device(device), dev_ptr(dev_ptr),
|
|
|
|
name(GGML_CUDA_NAME + std::to_string(device)) {
|
2023-12-13 19:54:54 +00:00
|
|
|
}
|
2024-03-20 13:42:59 +00:00
|
|
|
|
|
|
|
~ggml_backend_cuda_buffer_context() {
|
|
|
|
CUDA_CHECK(cudaFree(dev_ptr));
|
2023-12-13 19:54:54 +00:00
|
|
|
}
|
2024-03-20 13:42:59 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
GGML_CALL static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
|
|
|
|
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
|
|
|
return ctx->name.c_str();
|
2023-12-13 19:54:54 +00:00
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
|
|
|
|
return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
|
2023-12-13 19:54:54 +00:00
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
|
|
|
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
|
|
|
delete ctx;
|
|
|
|
}
|
2023-12-13 19:54:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
|
|
|
|
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
|
|
|
return ctx->dev_ptr;
|
2023-12-13 19:54:54 +00:00
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
|
|
|
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
|
|
|
|
2024-04-03 13:07:05 +00:00
|
|
|
if (tensor->view_src != NULL) {
|
2024-03-20 13:42:59 +00:00
|
|
|
assert(tensor->view_src->buffer->buft == buffer->buft);
|
2024-03-03 12:23:52 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
if (ggml_is_quantized(tensor->type)) {
|
|
|
|
// initialize padding to 0 to avoid possible NaN values
|
|
|
|
size_t original_size = ggml_nbytes(tensor);
|
|
|
|
size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
|
2024-03-03 12:23:52 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
if (padded_size > original_size && tensor->view_src == nullptr) {
|
|
|
|
ggml_cuda_set_device(ctx->device);
|
|
|
|
CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
|
|
|
|
}
|
2024-03-03 12:23:52 +00:00
|
|
|
}
|
2024-03-20 13:42:59 +00:00
|
|
|
}
|
2024-03-03 12:23:52 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
|
|
|
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
2024-03-03 12:23:52 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
ggml_cuda_set_device(ctx->device);
|
|
|
|
CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
|
|
|
|
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
|
2024-03-03 12:23:52 +00:00
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
|
|
|
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
2023-12-13 19:54:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
ggml_cuda_set_device(ctx->device);
|
|
|
|
CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
|
|
|
|
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
|
|
|
|
}
|
2023-12-13 19:54:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
|
|
|
|
if (ggml_backend_buffer_is_cuda(src->buffer)) {
|
|
|
|
ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
|
|
|
|
ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)dst->buffer->context;
|
|
|
|
if (src_ctx->device == dst_ctx->device) {
|
|
|
|
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice, cudaStreamPerThread));
|
|
|
|
} else {
|
2024-03-22 13:05:31 +00:00
|
|
|
#ifdef GGML_CUDA_NO_PEER_COPY
|
|
|
|
return false;
|
|
|
|
#else
|
2024-03-20 13:42:59 +00:00
|
|
|
CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, dst_ctx->device, src->data, src_ctx->device, ggml_nbytes(src), cudaStreamPerThread));
|
2024-03-22 13:05:31 +00:00
|
|
|
#endif
|
2024-03-20 13:42:59 +00:00
|
|
|
}
|
|
|
|
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
|
|
|
|
return true;
|
2023-12-13 19:54:54 +00:00
|
|
|
}
|
2024-03-20 13:42:59 +00:00
|
|
|
return false;
|
2023-12-13 19:54:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_UNUSED(buffer);
|
|
|
|
}
|
2023-12-13 19:54:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
|
|
|
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
2023-12-13 19:54:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
ggml_cuda_set_device(ctx->device);
|
|
|
|
CUDA_CHECK(cudaDeviceSynchronize());
|
|
|
|
CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
|
|
|
|
CUDA_CHECK(cudaDeviceSynchronize());
|
|
|
|
}
|
2023-12-13 19:54:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
|
|
|
|
/* .get_name = */ ggml_backend_cuda_buffer_get_name,
|
|
|
|
/* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
|
|
|
|
/* .get_base = */ ggml_backend_cuda_buffer_get_base,
|
|
|
|
/* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
|
|
|
|
/* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor,
|
|
|
|
/* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
|
|
|
|
/* .cpy_tensor = */ ggml_backend_cuda_buffer_cpy_tensor,
|
|
|
|
/* .clear = */ ggml_backend_cuda_buffer_clear,
|
|
|
|
/* .reset = */ NULL,
|
|
|
|
};
|
2023-12-13 19:54:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
// cuda buffer type
|
|
|
|
struct ggml_backend_cuda_buffer_type_context {
|
|
|
|
int device;
|
|
|
|
std::string name;
|
|
|
|
};
|
2023-12-13 19:54:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
|
|
|
ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
2023-12-13 19:54:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
return ctx->name.c_str();
|
2023-12-13 19:54:54 +00:00
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
|
|
|
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
2023-06-06 19:33:23 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
ggml_cuda_set_device(buft_ctx->device);
|
2023-06-06 19:33:23 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
|
2023-06-06 19:33:23 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
void * dev_ptr;
|
|
|
|
cudaError_t err = cudaMalloc(&dev_ptr, size);
|
|
|
|
if (err != cudaSuccess) {
|
|
|
|
fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
|
|
|
|
return nullptr;
|
2023-06-06 19:33:23 +00:00
|
|
|
}
|
2023-05-13 13:38:36 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);
|
2023-05-13 13:38:36 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
|
|
|
|
}
|
2023-05-13 13:38:36 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
|
|
|
return 128;
|
2023-05-13 13:38:36 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_UNUSED(buft);
|
2023-05-13 13:38:36 +00:00
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
|
|
|
size_t size = ggml_nbytes(tensor);
|
|
|
|
int64_t ne0 = tensor->ne[0];
|
2023-05-13 13:38:36 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
if (ggml_is_quantized(tensor->type)) {
|
|
|
|
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
|
|
|
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
|
|
|
}
|
|
|
|
}
|
2023-05-13 13:38:36 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
return size;
|
2023-05-13 13:38:36 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_UNUSED(buft);
|
2023-05-13 13:38:36 +00:00
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
|
|
|
if (!ggml_backend_is_cuda(backend)) {
|
|
|
|
return false;
|
|
|
|
}
|
2023-05-13 13:38:36 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
|
|
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
2023-05-13 13:38:36 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
return buft_ctx->device == cuda_ctx->device;
|
2023-05-13 13:38:36 +00:00
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
|
|
|
|
/* .get_name = */ ggml_backend_cuda_buffer_type_name,
|
|
|
|
/* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
|
|
|
|
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
|
|
|
|
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
|
|
|
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
|
|
|
|
/* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
|
|
|
|
/* .is_host = */ NULL,
|
|
|
|
};
|
2023-05-13 13:38:36 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
|
|
|
static std::mutex mutex;
|
|
|
|
std::lock_guard<std::mutex> lock(mutex);
|
2023-05-13 13:38:36 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
if (device >= ggml_backend_cuda_get_device_count()) {
|
|
|
|
return nullptr;
|
|
|
|
}
|
2023-05-13 13:38:36 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
static ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
|
2023-05-13 13:38:36 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
static bool ggml_backend_cuda_buffer_type_initialized = false;
|
2023-05-13 13:38:36 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
if (!ggml_backend_cuda_buffer_type_initialized) {
|
|
|
|
for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
|
|
|
|
ggml_backend_cuda_buffer_types[i] = {
|
|
|
|
/* .iface = */ ggml_backend_cuda_buffer_type_interface,
|
|
|
|
/* .context = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)},
|
|
|
|
};
|
|
|
|
}
|
|
|
|
ggml_backend_cuda_buffer_type_initialized = true;
|
|
|
|
}
|
2023-05-13 13:38:36 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
return &ggml_backend_cuda_buffer_types[device];
|
|
|
|
}
|
2023-05-13 13:38:36 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
// cuda split buffer
|
2023-05-13 13:38:36 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
|
|
|
|
int64_t min_compute_capability = INT_MAX;
|
|
|
|
int64_t max_compute_capability = INT_MIN;
|
|
|
|
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
|
|
|
if (tensor_split[id] < (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
|
2024-03-25 12:50:23 +00:00
|
|
|
if (min_compute_capability > ggml_cuda_info().devices[id].cc) {
|
|
|
|
min_compute_capability = ggml_cuda_info().devices[id].cc;
|
2024-03-20 13:42:59 +00:00
|
|
|
}
|
2024-03-25 12:50:23 +00:00
|
|
|
if (max_compute_capability < ggml_cuda_info().devices[id].cc) {
|
|
|
|
max_compute_capability = ggml_cuda_info().devices[id].cc;
|
2024-03-20 13:42:59 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2023-05-13 13:38:36 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
|
|
|
switch(type) {
|
|
|
|
case GGML_TYPE_Q4_0:
|
|
|
|
case GGML_TYPE_Q4_1:
|
|
|
|
case GGML_TYPE_Q5_0:
|
|
|
|
case GGML_TYPE_Q5_1:
|
|
|
|
case GGML_TYPE_Q8_0:
|
|
|
|
return max_compute_capability >= CC_RDNA2 ? 128 : 64;
|
|
|
|
case GGML_TYPE_F16:
|
|
|
|
case GGML_TYPE_F32:
|
|
|
|
return 1;
|
|
|
|
case GGML_TYPE_Q2_K:
|
|
|
|
return max_compute_capability >= CC_RDNA2 ? 128 : 32;
|
|
|
|
case GGML_TYPE_Q3_K:
|
|
|
|
return min_compute_capability < CC_RDNA2 ? 128 : 64;
|
|
|
|
case GGML_TYPE_Q4_K:
|
|
|
|
case GGML_TYPE_Q5_K:
|
|
|
|
case GGML_TYPE_Q6_K:
|
|
|
|
case GGML_TYPE_IQ2_XXS:
|
|
|
|
case GGML_TYPE_IQ2_XS:
|
|
|
|
case GGML_TYPE_IQ2_S:
|
|
|
|
case GGML_TYPE_IQ3_XXS:
|
|
|
|
case GGML_TYPE_IQ1_S:
|
2024-03-26 14:21:27 +00:00
|
|
|
case GGML_TYPE_IQ1_M:
|
2024-03-20 13:42:59 +00:00
|
|
|
case GGML_TYPE_IQ4_NL:
|
|
|
|
case GGML_TYPE_IQ4_XS:
|
|
|
|
case GGML_TYPE_IQ3_S:
|
|
|
|
return max_compute_capability >= CC_RDNA2 ? 128 : 64;
|
|
|
|
default:
|
|
|
|
GGML_ASSERT(false);
|
|
|
|
}
|
2023-06-19 08:23:56 +00:00
|
|
|
#else
|
2024-03-20 13:42:59 +00:00
|
|
|
switch(type) {
|
|
|
|
case GGML_TYPE_Q4_0:
|
|
|
|
case GGML_TYPE_Q4_1:
|
|
|
|
return max_compute_capability >= CC_VOLTA ? 128 : 64;
|
|
|
|
case GGML_TYPE_Q5_0:
|
|
|
|
case GGML_TYPE_Q5_1:
|
|
|
|
case GGML_TYPE_Q8_0:
|
|
|
|
return 64;
|
|
|
|
case GGML_TYPE_F16:
|
|
|
|
case GGML_TYPE_F32:
|
|
|
|
return 1;
|
|
|
|
case GGML_TYPE_Q2_K:
|
|
|
|
case GGML_TYPE_Q3_K:
|
|
|
|
case GGML_TYPE_Q4_K:
|
|
|
|
case GGML_TYPE_Q5_K:
|
|
|
|
case GGML_TYPE_IQ2_XXS:
|
|
|
|
case GGML_TYPE_IQ2_XS:
|
|
|
|
case GGML_TYPE_IQ2_S:
|
|
|
|
case GGML_TYPE_IQ3_XXS:
|
|
|
|
case GGML_TYPE_IQ1_S:
|
2024-03-26 14:21:27 +00:00
|
|
|
case GGML_TYPE_IQ1_M:
|
2024-03-20 13:42:59 +00:00
|
|
|
case GGML_TYPE_IQ4_NL:
|
|
|
|
case GGML_TYPE_IQ4_XS:
|
|
|
|
case GGML_TYPE_IQ3_S:
|
|
|
|
return max_compute_capability >= CC_VOLTA ? 128 : 64;
|
|
|
|
case GGML_TYPE_Q6_K:
|
|
|
|
return 64;
|
|
|
|
default:
|
|
|
|
GGML_ASSERT(false);
|
|
|
|
}
|
|
|
|
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
2023-05-13 13:38:36 +00:00
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) {
|
|
|
|
const int64_t nrows = ggml_nrows(tensor);
|
|
|
|
const int64_t rounding = get_row_rounding(tensor->type, tensor_split);
|
2024-01-15 05:48:06 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
*row_low = id == 0 ? 0 : nrows*tensor_split[id];
|
|
|
|
*row_low -= *row_low % rounding;
|
2024-01-15 05:48:06 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
if (id == ggml_backend_cuda_get_device_count() - 1) {
|
|
|
|
*row_high = nrows;
|
|
|
|
} else {
|
|
|
|
*row_high = nrows*tensor_split[id + 1];
|
|
|
|
*row_high -= *row_high % rounding;
|
2024-01-15 05:48:06 +00:00
|
|
|
}
|
2024-03-20 13:42:59 +00:00
|
|
|
}
|
2024-01-15 05:48:06 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
|
|
|
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
2024-01-15 05:48:06 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
|
|
|
|
}
|
2024-01-15 05:48:06 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
struct ggml_backend_cuda_split_buffer_type_context {
|
|
|
|
std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
|
|
|
|
};
|
2024-01-15 05:48:06 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
struct ggml_backend_cuda_split_buffer_context {
|
|
|
|
~ggml_backend_cuda_split_buffer_context() {
|
|
|
|
for (ggml_tensor_extra_gpu * extra : tensor_extras) {
|
|
|
|
for (int id = 0; id < GGML_CUDA_MAX_DEVICES; ++id) {
|
|
|
|
for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) {
|
|
|
|
if (extra->events[id][is] != nullptr) {
|
|
|
|
CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (extra->data_device[id] != nullptr) {
|
|
|
|
CUDA_CHECK(cudaFree(extra->data_device[id]));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
delete extra;
|
|
|
|
}
|
2024-01-15 05:48:06 +00:00
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
std::vector<ggml_tensor_extra_gpu *> tensor_extras;
|
|
|
|
};
|
2024-01-15 05:48:06 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
|
|
|
|
return GGML_CUDA_NAME "_Split";
|
2024-01-15 05:48:06 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_UNUSED(buffer);
|
|
|
|
}
|
2024-01-15 05:48:06 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
|
|
|
|
return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
|
|
|
|
GGML_UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds
|
|
|
|
}
|
2024-01-15 05:48:06 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
|
|
|
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
|
|
|
delete ctx;
|
|
|
|
}
|
2024-01-15 05:48:06 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
|
|
|
|
// the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
|
|
|
|
return (void *)0x1000;
|
2024-01-15 05:48:06 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_UNUSED(buffer);
|
2024-01-15 05:48:06 +00:00
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
|
|
|
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
|
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 19:56:18 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
|
|
|
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
|
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 19:56:18 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
const int64_t ne0 = tensor->ne[0];
|
k-quants : support for super-block size of 64 (#2001)
* k_quants: WIP super-blocks with 64 weights
* k_quants: WIP super-blocks with 64 weights
Q6_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q4_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)
* k_quants: WIP super-blocks with 64 weights
Q3_K scalar and AVX2 works.
* k_quants: WIP super-blocks with 64 weights
Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar
* k_quants: WIP super-blocks with 64 weights
Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,
* k_quants: WIP super-blocks with 64 weights
Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q3_K working on CUDA.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on CUDA, and with this CUDA is done.
* k_quants: WIP super-blocks with 64 weights
Q6_K working on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Q4_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q2_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.
With that, we have full support for ARM_NEON, although
performance is not quite there.
* k_quants: WIP super-blocks with 64 weights
Slightly more efficient Q3_K and Q5_K
* k_quants: WIP super-blocks with 64 weights
Another small improvement for Q3_K and Q5_K on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.
* k_quants: WIP super-blocks with 64 weights
* We are able to pass preprocessor macros to the Metal
compiler
* Q6_K works and is actually slightly more efficient than
the QK_K = 256 version (25.2 ms vs 25.8 ms)
* k_quants: WIP super-blocks with 64 weights
Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).
* k_quants: WIP super-blocks with 64 weights
Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).
* k_quants: WIP super-blocks with 64 weights
Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).
* k_quants: WIP super-blocks with 64 weights
Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).
* k_quants: call them _K, not _k, also on Metal
* k_quants: correctly define QK_K in llama.cpp
* Fixed bug in q4_K quantization added with the 64-block addition
* Simplify via lambda
* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64
Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.
* k_quants: switch Q4_K to 4-bit scales when QK_K = 64
Here the loss in accuracy is greater than for Q3_K,
but the Q4_K points still move further to the left on
the perplexity vs size curve.
* k_quants: forgot to add the Metal changes in last commit
* k_quants: change Q5_K to be type 0 when QK_K = 64
Still needs AVX2 implementation
* k_quants: AVX2 implementation for new 64-weight Q5_K
* k_quants: 10% faster ARM_NEON Q5_K dot product
* k_quants: fixed issue caused by merging with master
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-06-26 16:43:07 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
|
|
|
|
ctx->tensor_extras.push_back(extra);
|
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 19:56:18 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
|
|
|
int64_t row_low, row_high;
|
|
|
|
get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
|
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 19:56:18 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
int64_t nrows_split = row_high - row_low;
|
|
|
|
if (nrows_split == 0) {
|
|
|
|
continue;
|
|
|
|
}
|
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 19:56:18 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
size_t size = ggml_nbytes_split(tensor, nrows_split);
|
|
|
|
const size_t original_size = size;
|
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 19:56:18 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
// pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
|
|
|
|
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
|
|
|
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
|
|
|
}
|
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 19:56:18 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
// FIXME: do not crash if cudaMalloc fails
|
|
|
|
// currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
|
|
|
|
ggml_cuda_set_device(id);
|
|
|
|
char * buf;
|
|
|
|
CUDA_CHECK(cudaMalloc(&buf, size));
|
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 19:56:18 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
// set padding to 0 to avoid possible NaN values
|
|
|
|
if (size > original_size) {
|
|
|
|
CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
|
|
|
|
}
|
k-quants : support for super-block size of 64 (#2001)
* k_quants: WIP super-blocks with 64 weights
* k_quants: WIP super-blocks with 64 weights
Q6_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q4_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)
* k_quants: WIP super-blocks with 64 weights
Q3_K scalar and AVX2 works.
* k_quants: WIP super-blocks with 64 weights
Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar
* k_quants: WIP super-blocks with 64 weights
Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,
* k_quants: WIP super-blocks with 64 weights
Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q3_K working on CUDA.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on CUDA, and with this CUDA is done.
* k_quants: WIP super-blocks with 64 weights
Q6_K working on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Q4_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q2_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.
With that, we have full support for ARM_NEON, although
performance is not quite there.
* k_quants: WIP super-blocks with 64 weights
Slightly more efficient Q3_K and Q5_K
* k_quants: WIP super-blocks with 64 weights
Another small improvement for Q3_K and Q5_K on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.
* k_quants: WIP super-blocks with 64 weights
* We are able to pass preprocessor macros to the Metal
compiler
* Q6_K works and is actually slightly more efficient than
the QK_K = 256 version (25.2 ms vs 25.8 ms)
* k_quants: WIP super-blocks with 64 weights
Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).
* k_quants: WIP super-blocks with 64 weights
Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).
* k_quants: WIP super-blocks with 64 weights
Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).
* k_quants: WIP super-blocks with 64 weights
Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).
* k_quants: call them _K, not _k, also on Metal
* k_quants: correctly define QK_K in llama.cpp
* Fixed bug in q4_K quantization added with the 64-block addition
* Simplify via lambda
* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64
Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.
* k_quants: switch Q4_K to 4-bit scales when QK_K = 64
Here the loss in accuracy is greater than for Q3_K,
but the Q4_K points still move further to the left on
the perplexity vs size curve.
* k_quants: forgot to add the Metal changes in last commit
* k_quants: change Q5_K to be type 0 when QK_K = 64
Still needs AVX2 implementation
* k_quants: AVX2 implementation for new 64-weight Q5_K
* k_quants: 10% faster ARM_NEON Q5_K dot product
* k_quants: fixed issue caused by merging with master
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-06-26 16:43:07 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
extra->data_device[id] = buf;
|
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 19:56:18 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) {
|
|
|
|
CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
tensor->extra = extra;
|
|
|
|
}
|
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 19:56:18 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
|
|
|
// split tensors must always be set in their entirety at once
|
|
|
|
GGML_ASSERT(offset == 0);
|
|
|
|
GGML_ASSERT(size == ggml_nbytes(tensor));
|
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 19:56:18 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
|
k-quants : support for super-block size of 64 (#2001)
* k_quants: WIP super-blocks with 64 weights
* k_quants: WIP super-blocks with 64 weights
Q6_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q4_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)
* k_quants: WIP super-blocks with 64 weights
Q3_K scalar and AVX2 works.
* k_quants: WIP super-blocks with 64 weights
Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar
* k_quants: WIP super-blocks with 64 weights
Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,
* k_quants: WIP super-blocks with 64 weights
Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q3_K working on CUDA.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on CUDA, and with this CUDA is done.
* k_quants: WIP super-blocks with 64 weights
Q6_K working on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Q4_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q2_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.
With that, we have full support for ARM_NEON, although
performance is not quite there.
* k_quants: WIP super-blocks with 64 weights
Slightly more efficient Q3_K and Q5_K
* k_quants: WIP super-blocks with 64 weights
Another small improvement for Q3_K and Q5_K on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.
* k_quants: WIP super-blocks with 64 weights
* We are able to pass preprocessor macros to the Metal
compiler
* Q6_K works and is actually slightly more efficient than
the QK_K = 256 version (25.2 ms vs 25.8 ms)
* k_quants: WIP super-blocks with 64 weights
Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).
* k_quants: WIP super-blocks with 64 weights
Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).
* k_quants: WIP super-blocks with 64 weights
Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).
* k_quants: WIP super-blocks with 64 weights
Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).
* k_quants: call them _K, not _k, also on Metal
* k_quants: correctly define QK_K in llama.cpp
* Fixed bug in q4_K quantization added with the 64-block addition
* Simplify via lambda
* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64
Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.
* k_quants: switch Q4_K to 4-bit scales when QK_K = 64
Here the loss in accuracy is greater than for Q3_K,
but the Q4_K points still move further to the left on
the perplexity vs size curve.
* k_quants: forgot to add the Metal changes in last commit
* k_quants: change Q5_K to be type 0 when QK_K = 64
Still needs AVX2 implementation
* k_quants: AVX2 implementation for new 64-weight Q5_K
* k_quants: 10% faster ARM_NEON Q5_K dot product
* k_quants: fixed issue caused by merging with master
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-06-26 16:43:07 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
const int64_t ne0 = tensor->ne[0];
|
|
|
|
const size_t nb1 = tensor->nb[1];
|
|
|
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
|
k-quants : support for super-block size of 64 (#2001)
* k_quants: WIP super-blocks with 64 weights
* k_quants: WIP super-blocks with 64 weights
Q6_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q4_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)
* k_quants: WIP super-blocks with 64 weights
Q3_K scalar and AVX2 works.
* k_quants: WIP super-blocks with 64 weights
Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar
* k_quants: WIP super-blocks with 64 weights
Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,
* k_quants: WIP super-blocks with 64 weights
Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q3_K working on CUDA.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on CUDA, and with this CUDA is done.
* k_quants: WIP super-blocks with 64 weights
Q6_K working on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Q4_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q2_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.
With that, we have full support for ARM_NEON, although
performance is not quite there.
* k_quants: WIP super-blocks with 64 weights
Slightly more efficient Q3_K and Q5_K
* k_quants: WIP super-blocks with 64 weights
Another small improvement for Q3_K and Q5_K on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.
* k_quants: WIP super-blocks with 64 weights
* We are able to pass preprocessor macros to the Metal
compiler
* Q6_K works and is actually slightly more efficient than
the QK_K = 256 version (25.2 ms vs 25.8 ms)
* k_quants: WIP super-blocks with 64 weights
Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).
* k_quants: WIP super-blocks with 64 weights
Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).
* k_quants: WIP super-blocks with 64 weights
Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).
* k_quants: WIP super-blocks with 64 weights
Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).
* k_quants: call them _K, not _k, also on Metal
* k_quants: correctly define QK_K in llama.cpp
* Fixed bug in q4_K quantization added with the 64-block addition
* Simplify via lambda
* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64
Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.
* k_quants: switch Q4_K to 4-bit scales when QK_K = 64
Here the loss in accuracy is greater than for Q3_K,
but the Q4_K points still move further to the left on
the perplexity vs size curve.
* k_quants: forgot to add the Metal changes in last commit
* k_quants: change Q5_K to be type 0 when QK_K = 64
Still needs AVX2 implementation
* k_quants: AVX2 implementation for new 64-weight Q5_K
* k_quants: 10% faster ARM_NEON Q5_K dot product
* k_quants: fixed issue caused by merging with master
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-06-26 16:43:07 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
|
|
|
int64_t row_low, row_high;
|
|
|
|
get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
|
k-quants : support for super-block size of 64 (#2001)
* k_quants: WIP super-blocks with 64 weights
* k_quants: WIP super-blocks with 64 weights
Q6_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q4_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)
* k_quants: WIP super-blocks with 64 weights
Q3_K scalar and AVX2 works.
* k_quants: WIP super-blocks with 64 weights
Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar
* k_quants: WIP super-blocks with 64 weights
Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,
* k_quants: WIP super-blocks with 64 weights
Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q3_K working on CUDA.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on CUDA, and with this CUDA is done.
* k_quants: WIP super-blocks with 64 weights
Q6_K working on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Q4_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q2_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.
With that, we have full support for ARM_NEON, although
performance is not quite there.
* k_quants: WIP super-blocks with 64 weights
Slightly more efficient Q3_K and Q5_K
* k_quants: WIP super-blocks with 64 weights
Another small improvement for Q3_K and Q5_K on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.
* k_quants: WIP super-blocks with 64 weights
* We are able to pass preprocessor macros to the Metal
compiler
* Q6_K works and is actually slightly more efficient than
the QK_K = 256 version (25.2 ms vs 25.8 ms)
* k_quants: WIP super-blocks with 64 weights
Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).
* k_quants: WIP super-blocks with 64 weights
Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).
* k_quants: WIP super-blocks with 64 weights
Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).
* k_quants: WIP super-blocks with 64 weights
Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).
* k_quants: call them _K, not _k, also on Metal
* k_quants: correctly define QK_K in llama.cpp
* Fixed bug in q4_K quantization added with the 64-block addition
* Simplify via lambda
* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64
Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.
* k_quants: switch Q4_K to 4-bit scales when QK_K = 64
Here the loss in accuracy is greater than for Q3_K,
but the Q4_K points still move further to the left on
the perplexity vs size curve.
* k_quants: forgot to add the Metal changes in last commit
* k_quants: change Q5_K to be type 0 when QK_K = 64
Still needs AVX2 implementation
* k_quants: AVX2 implementation for new 64-weight Q5_K
* k_quants: 10% faster ARM_NEON Q5_K dot product
* k_quants: fixed issue caused by merging with master
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-06-26 16:43:07 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
int64_t nrows_split = row_high - row_low;
|
|
|
|
if (nrows_split == 0) {
|
|
|
|
continue;
|
|
|
|
}
|
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 19:56:18 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
const size_t offset_split = row_low*nb1;
|
|
|
|
size_t size = ggml_nbytes_split(tensor, nrows_split);
|
|
|
|
const size_t original_size = size;
|
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 19:56:18 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
// pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
|
|
|
|
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
|
|
|
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
|
|
|
}
|
|
|
|
|
|
|
|
const char * buf_host = (const char *)data + offset_split;
|
|
|
|
CUDA_CHECK(cudaMemcpyAsync(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice, cudaStreamPerThread));
|
|
|
|
}
|
|
|
|
|
|
|
|
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
|
|
|
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
|
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 19:56:18 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
|
|
|
// split tensors must always be set in their entirety at once
|
|
|
|
GGML_ASSERT(offset == 0);
|
|
|
|
GGML_ASSERT(size == ggml_nbytes(tensor));
|
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 19:56:18 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
|
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 19:56:18 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
const int64_t ne0 = tensor->ne[0];
|
|
|
|
const size_t nb1 = tensor->nb[1];
|
|
|
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
|
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 19:56:18 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
|
|
|
int64_t row_low, row_high;
|
|
|
|
get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
|
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 19:56:18 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
int64_t nrows_split = row_high - row_low;
|
|
|
|
if (nrows_split == 0) {
|
|
|
|
continue;
|
|
|
|
}
|
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 19:56:18 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
const size_t offset_split = row_low*nb1;
|
|
|
|
size_t size = ggml_nbytes_split(tensor, nrows_split);
|
|
|
|
const size_t original_size = size;
|
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 19:56:18 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
// pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
|
|
|
|
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
|
|
|
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
|
|
|
}
|
|
|
|
|
|
|
|
char * buf_host = (char *)data + offset_split;
|
|
|
|
CUDA_CHECK(cudaMemcpyAsync(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
|
|
|
|
}
|
|
|
|
|
|
|
|
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
|
|
|
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
|
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 19:56:18 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_CALL static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
|
|
|
GGML_UNUSED(buffer);
|
|
|
|
GGML_UNUSED(value);
|
|
|
|
}
|
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 19:56:18 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
|
|
|
|
/* .get_name = */ ggml_backend_cuda_split_buffer_get_name,
|
|
|
|
/* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer,
|
|
|
|
/* .get_base = */ ggml_backend_cuda_split_buffer_get_base,
|
|
|
|
/* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor,
|
|
|
|
/* .set_tensor = */ ggml_backend_cuda_split_buffer_set_tensor,
|
|
|
|
/* .get_tensor = */ ggml_backend_cuda_split_buffer_get_tensor,
|
|
|
|
/* .cpy_tensor = */ NULL,
|
|
|
|
/* .clear = */ ggml_backend_cuda_split_buffer_clear,
|
|
|
|
/* .reset = */ NULL,
|
|
|
|
};
|
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 19:56:18 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
// cuda split buffer type
|
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 19:56:18 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
|
|
|
return GGML_CUDA_NAME "_Split";
|
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 19:56:18 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_UNUSED(buft);
|
|
|
|
}
|
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 19:56:18 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
|
|
|
// since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
|
|
|
|
// instead, we allocate them for each tensor separately in init_tensor
|
|
|
|
// however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
|
|
|
|
// as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
|
|
|
|
ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context();
|
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 19:56:18 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
|
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 19:56:18 +00:00
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
|
|
|
return 128;
|
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 19:56:18 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_UNUSED(buft);
|
|
|
|
}
|
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 19:56:18 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
|
|
|
ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
|
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 19:56:18 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
size_t total_size = 0;
|
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 19:56:18 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
const int64_t ne0 = tensor->ne[0];
|
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 19:56:18 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
|
|
|
int64_t row_low, row_high;
|
|
|
|
get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id);
|
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 19:56:18 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
int64_t nrows_split = row_high - row_low;
|
|
|
|
if (nrows_split == 0) {
|
|
|
|
continue;
|
|
|
|
}
|
k-quants : support for super-block size of 64 (#2001)
* k_quants: WIP super-blocks with 64 weights
* k_quants: WIP super-blocks with 64 weights
Q6_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q4_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)
* k_quants: WIP super-blocks with 64 weights
Q3_K scalar and AVX2 works.
* k_quants: WIP super-blocks with 64 weights
Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar
* k_quants: WIP super-blocks with 64 weights
Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,
* k_quants: WIP super-blocks with 64 weights
Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q3_K working on CUDA.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on CUDA, and with this CUDA is done.
* k_quants: WIP super-blocks with 64 weights
Q6_K working on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Q4_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q2_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.
With that, we have full support for ARM_NEON, although
performance is not quite there.
* k_quants: WIP super-blocks with 64 weights
Slightly more efficient Q3_K and Q5_K
* k_quants: WIP super-blocks with 64 weights
Another small improvement for Q3_K and Q5_K on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.
* k_quants: WIP super-blocks with 64 weights
* We are able to pass preprocessor macros to the Metal
compiler
* Q6_K works and is actually slightly more efficient than
the QK_K = 256 version (25.2 ms vs 25.8 ms)
* k_quants: WIP super-blocks with 64 weights
Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).
* k_quants: WIP super-blocks with 64 weights
Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).
* k_quants: WIP super-blocks with 64 weights
Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).
* k_quants: WIP super-blocks with 64 weights
Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).
* k_quants: call them _K, not _k, also on Metal
* k_quants: correctly define QK_K in llama.cpp
* Fixed bug in q4_K quantization added with the 64-block addition
* Simplify via lambda
* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64
Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.
* k_quants: switch Q4_K to 4-bit scales when QK_K = 64
Here the loss in accuracy is greater than for Q3_K,
but the Q4_K points still move further to the left on
the perplexity vs size curve.
* k_quants: forgot to add the Metal changes in last commit
* k_quants: change Q5_K to be type 0 when QK_K = 64
Still needs AVX2 implementation
* k_quants: AVX2 implementation for new 64-weight Q5_K
* k_quants: 10% faster ARM_NEON Q5_K dot product
* k_quants: fixed issue caused by merging with master
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-06-26 16:43:07 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
total_size += ggml_nbytes_split(tensor, nrows_split);
|
k-quants : support for super-block size of 64 (#2001)
* k_quants: WIP super-blocks with 64 weights
* k_quants: WIP super-blocks with 64 weights
Q6_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q4_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)
* k_quants: WIP super-blocks with 64 weights
Q3_K scalar and AVX2 works.
* k_quants: WIP super-blocks with 64 weights
Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar
* k_quants: WIP super-blocks with 64 weights
Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,
* k_quants: WIP super-blocks with 64 weights
Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q3_K working on CUDA.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on CUDA, and with this CUDA is done.
* k_quants: WIP super-blocks with 64 weights
Q6_K working on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Q4_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q2_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.
With that, we have full support for ARM_NEON, although
performance is not quite there.
* k_quants: WIP super-blocks with 64 weights
Slightly more efficient Q3_K and Q5_K
* k_quants: WIP super-blocks with 64 weights
Another small improvement for Q3_K and Q5_K on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.
* k_quants: WIP super-blocks with 64 weights
* We are able to pass preprocessor macros to the Metal
compiler
* Q6_K works and is actually slightly more efficient than
the QK_K = 256 version (25.2 ms vs 25.8 ms)
* k_quants: WIP super-blocks with 64 weights
Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).
* k_quants: WIP super-blocks with 64 weights
Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).
* k_quants: WIP super-blocks with 64 weights
Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).
* k_quants: WIP super-blocks with 64 weights
Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).
* k_quants: call them _K, not _k, also on Metal
* k_quants: correctly define QK_K in llama.cpp
* Fixed bug in q4_K quantization added with the 64-block addition
* Simplify via lambda
* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64
Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.
* k_quants: switch Q4_K to 4-bit scales when QK_K = 64
Here the loss in accuracy is greater than for Q3_K,
but the Q4_K points still move further to the left on
the perplexity vs size curve.
* k_quants: forgot to add the Metal changes in last commit
* k_quants: change Q5_K to be type 0 when QK_K = 64
Still needs AVX2 implementation
* k_quants: AVX2 implementation for new 64-weight Q5_K
* k_quants: 10% faster ARM_NEON Q5_K dot product
* k_quants: fixed issue caused by merging with master
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-06-26 16:43:07 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
// pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
|
|
|
|
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
|
|
|
total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
|
|
|
}
|
|
|
|
}
|
k-quants : support for super-block size of 64 (#2001)
* k_quants: WIP super-blocks with 64 weights
* k_quants: WIP super-blocks with 64 weights
Q6_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q4_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)
* k_quants: WIP super-blocks with 64 weights
Q3_K scalar and AVX2 works.
* k_quants: WIP super-blocks with 64 weights
Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar
* k_quants: WIP super-blocks with 64 weights
Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,
* k_quants: WIP super-blocks with 64 weights
Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q3_K working on CUDA.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on CUDA, and with this CUDA is done.
* k_quants: WIP super-blocks with 64 weights
Q6_K working on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Q4_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q2_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.
With that, we have full support for ARM_NEON, although
performance is not quite there.
* k_quants: WIP super-blocks with 64 weights
Slightly more efficient Q3_K and Q5_K
* k_quants: WIP super-blocks with 64 weights
Another small improvement for Q3_K and Q5_K on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.
* k_quants: WIP super-blocks with 64 weights
* We are able to pass preprocessor macros to the Metal
compiler
* Q6_K works and is actually slightly more efficient than
the QK_K = 256 version (25.2 ms vs 25.8 ms)
* k_quants: WIP super-blocks with 64 weights
Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).
* k_quants: WIP super-blocks with 64 weights
Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).
* k_quants: WIP super-blocks with 64 weights
Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).
* k_quants: WIP super-blocks with 64 weights
Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).
* k_quants: call them _K, not _k, also on Metal
* k_quants: correctly define QK_K in llama.cpp
* Fixed bug in q4_K quantization added with the 64-block addition
* Simplify via lambda
* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64
Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.
* k_quants: switch Q4_K to 4-bit scales when QK_K = 64
Here the loss in accuracy is greater than for Q3_K,
but the Q4_K points still move further to the left on
the perplexity vs size curve.
* k_quants: forgot to add the Metal changes in last commit
* k_quants: change Q5_K to be type 0 when QK_K = 64
Still needs AVX2 implementation
* k_quants: AVX2 implementation for new 64-weight Q5_K
* k_quants: 10% faster ARM_NEON Q5_K dot product
* k_quants: fixed issue caused by merging with master
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-06-26 16:43:07 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
return total_size;
|
|
|
|
}
|
k-quants : support for super-block size of 64 (#2001)
* k_quants: WIP super-blocks with 64 weights
* k_quants: WIP super-blocks with 64 weights
Q6_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q4_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)
* k_quants: WIP super-blocks with 64 weights
Q3_K scalar and AVX2 works.
* k_quants: WIP super-blocks with 64 weights
Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar
* k_quants: WIP super-blocks with 64 weights
Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,
* k_quants: WIP super-blocks with 64 weights
Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q3_K working on CUDA.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on CUDA, and with this CUDA is done.
* k_quants: WIP super-blocks with 64 weights
Q6_K working on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Q4_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q2_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.
With that, we have full support for ARM_NEON, although
performance is not quite there.
* k_quants: WIP super-blocks with 64 weights
Slightly more efficient Q3_K and Q5_K
* k_quants: WIP super-blocks with 64 weights
Another small improvement for Q3_K and Q5_K on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.
* k_quants: WIP super-blocks with 64 weights
* We are able to pass preprocessor macros to the Metal
compiler
* Q6_K works and is actually slightly more efficient than
the QK_K = 256 version (25.2 ms vs 25.8 ms)
* k_quants: WIP super-blocks with 64 weights
Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).
* k_quants: WIP super-blocks with 64 weights
Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).
* k_quants: WIP super-blocks with 64 weights
Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).
* k_quants: WIP super-blocks with 64 weights
Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).
* k_quants: call them _K, not _k, also on Metal
* k_quants: correctly define QK_K in llama.cpp
* Fixed bug in q4_K quantization added with the 64-block addition
* Simplify via lambda
* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64
Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.
* k_quants: switch Q4_K to 4-bit scales when QK_K = 64
Here the loss in accuracy is greater than for Q3_K,
but the Q4_K points still move further to the left on
the perplexity vs size curve.
* k_quants: forgot to add the Metal changes in last commit
* k_quants: change Q5_K to be type 0 when QK_K = 64
Still needs AVX2 implementation
* k_quants: AVX2 implementation for new 64-weight Q5_K
* k_quants: 10% faster ARM_NEON Q5_K dot product
* k_quants: fixed issue caused by merging with master
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-06-26 16:43:07 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
|
|
|
return ggml_backend_is_cuda(backend);
|
k-quants : support for super-block size of 64 (#2001)
* k_quants: WIP super-blocks with 64 weights
* k_quants: WIP super-blocks with 64 weights
Q6_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q4_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)
* k_quants: WIP super-blocks with 64 weights
Q3_K scalar and AVX2 works.
* k_quants: WIP super-blocks with 64 weights
Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar
* k_quants: WIP super-blocks with 64 weights
Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,
* k_quants: WIP super-blocks with 64 weights
Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q3_K working on CUDA.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on CUDA, and with this CUDA is done.
* k_quants: WIP super-blocks with 64 weights
Q6_K working on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Q4_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q2_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.
With that, we have full support for ARM_NEON, although
performance is not quite there.
* k_quants: WIP super-blocks with 64 weights
Slightly more efficient Q3_K and Q5_K
* k_quants: WIP super-blocks with 64 weights
Another small improvement for Q3_K and Q5_K on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.
* k_quants: WIP super-blocks with 64 weights
* We are able to pass preprocessor macros to the Metal
compiler
* Q6_K works and is actually slightly more efficient than
the QK_K = 256 version (25.2 ms vs 25.8 ms)
* k_quants: WIP super-blocks with 64 weights
Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).
* k_quants: WIP super-blocks with 64 weights
Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).
* k_quants: WIP super-blocks with 64 weights
Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).
* k_quants: WIP super-blocks with 64 weights
Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).
* k_quants: call them _K, not _k, also on Metal
* k_quants: correctly define QK_K in llama.cpp
* Fixed bug in q4_K quantization added with the 64-block addition
* Simplify via lambda
* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64
Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.
* k_quants: switch Q4_K to 4-bit scales when QK_K = 64
Here the loss in accuracy is greater than for Q3_K,
but the Q4_K points still move further to the left on
the perplexity vs size curve.
* k_quants: forgot to add the Metal changes in last commit
* k_quants: change Q5_K to be type 0 when QK_K = 64
Still needs AVX2 implementation
* k_quants: AVX2 implementation for new 64-weight Q5_K
* k_quants: 10% faster ARM_NEON Q5_K dot product
* k_quants: fixed issue caused by merging with master
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-06-26 16:43:07 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_UNUSED(buft);
|
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 19:56:18 +00:00
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
|
|
|
return false;
|
|
|
|
|
|
|
|
GGML_UNUSED(buft);
|
2024-01-08 15:02:32 +00:00
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = {
|
|
|
|
/* .get_name = */ ggml_backend_cuda_split_buffer_type_name,
|
|
|
|
/* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
|
|
|
|
/* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
|
|
|
|
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
|
|
|
/* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
|
|
|
|
/* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
|
|
|
|
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
|
|
|
|
};
|
2024-01-08 15:02:32 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
|
|
|
|
static std::mutex mutex;
|
|
|
|
std::lock_guard<std::mutex> lock(mutex);
|
2024-01-08 15:02:32 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
|
2024-01-08 15:02:32 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {};
|
2024-01-08 15:02:32 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_CUDA_MAX_DEVICES, [](float x) { return x == 0.0f; });
|
|
|
|
if (all_zero) {
|
2024-03-25 12:50:23 +00:00
|
|
|
tensor_split_arr = ggml_cuda_info().default_tensor_split;
|
2024-03-20 13:42:59 +00:00
|
|
|
} else {
|
|
|
|
float split_sum = 0.0f;
|
|
|
|
for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
|
|
|
|
tensor_split_arr[i] = split_sum;
|
|
|
|
split_sum += tensor_split[i];
|
|
|
|
}
|
|
|
|
for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
|
|
|
|
tensor_split_arr[i] /= split_sum;
|
|
|
|
}
|
|
|
|
}
|
2024-01-11 19:39:39 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
auto it = buft_map.find(tensor_split_arr);
|
|
|
|
if (it != buft_map.end()) {
|
|
|
|
return &it->second;
|
|
|
|
}
|
2024-01-11 19:39:39 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
struct ggml_backend_buffer_type buft {
|
|
|
|
/* .iface = */ ggml_backend_cuda_split_buffer_type_interface,
|
|
|
|
/* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr},
|
|
|
|
};
|
2024-01-11 19:39:39 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
auto result = buft_map.emplace(tensor_split_arr, buft);
|
|
|
|
return &result.first->second;
|
2024-01-11 19:39:39 +00:00
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
// host buffer type
|
2024-02-26 16:28:38 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_CALL static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
|
|
|
return GGML_CUDA_NAME "_Host";
|
2024-02-26 16:28:38 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_UNUSED(buft);
|
2024-02-26 16:28:38 +00:00
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_CALL static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
|
|
|
|
return GGML_CUDA_NAME "_Host";
|
2024-01-30 13:14:12 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_UNUSED(buffer);
|
|
|
|
}
|
2024-01-30 13:14:12 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_CALL static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
|
|
|
CUDA_CHECK(cudaFreeHost(buffer->context));
|
2024-01-30 13:14:12 +00:00
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
static void * ggml_cuda_host_malloc(size_t size) {
|
|
|
|
if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
|
|
|
|
return nullptr;
|
|
|
|
}
|
2024-02-24 14:23:52 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
void * ptr = nullptr;
|
|
|
|
cudaError_t err = cudaMallocHost((void **) &ptr, size);
|
|
|
|
if (err != cudaSuccess) {
|
|
|
|
// clear the error
|
|
|
|
cudaGetLastError();
|
|
|
|
fprintf(stderr, "%s: warning: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
|
|
|
|
size/1024.0/1024.0, cudaGetErrorString(err));
|
|
|
|
return nullptr;
|
2024-02-24 14:23:52 +00:00
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
return ptr;
|
2024-02-24 14:23:52 +00:00
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
|
|
|
void * ptr = ggml_cuda_host_malloc(size);
|
2024-02-18 16:16:55 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
if (ptr == nullptr) {
|
|
|
|
// fallback to cpu buffer
|
|
|
|
return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
|
2024-03-11 06:51:49 +00:00
|
|
|
}
|
2024-02-18 16:16:55 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
|
|
|
buffer->buft = buft;
|
|
|
|
buffer->iface.get_name = ggml_backend_cuda_host_buffer_name;
|
|
|
|
buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
|
|
|
|
|
|
|
|
return buffer;
|
2024-02-18 16:16:55 +00:00
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
|
|
|
|
static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
|
|
|
|
/* .iface = */ {
|
|
|
|
/* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
|
|
|
|
/* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
|
|
|
|
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
|
|
|
|
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
|
|
|
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
|
|
|
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
|
|
|
|
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
|
|
|
},
|
|
|
|
/* .context = */ nullptr,
|
|
|
|
};
|
2024-02-21 09:39:52 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
return &ggml_backend_cuda_buffer_type_host;
|
|
|
|
}
|
2024-02-21 09:39:52 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
//static bool ggml_backend_buffer_is_cuda_host(ggml_backend_buffer_t buffer) {
|
|
|
|
// return buffer->buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name;
|
|
|
|
//}
|
2024-02-21 09:39:52 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
/// kernels
|
|
|
|
|
|
|
|
typedef void (*ggml_cuda_op_mul_mat_t)(
|
|
|
|
ggml_backend_cuda_context & ctx,
|
|
|
|
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
|
|
|
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
|
|
|
const int64_t src1_padded_row_size, cudaStream_t stream);
|
2023-06-16 17:08:44 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
#ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE
|
|
|
|
#define GGML_CUDA_PEER_MAX_BATCH_SIZE 128
|
|
|
|
#endif // GGML_CUDA_PEER_MAX_BATCH_SIZE
|
k-quants : support for super-block size of 64 (#2001)
* k_quants: WIP super-blocks with 64 weights
* k_quants: WIP super-blocks with 64 weights
Q6_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q4_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)
* k_quants: WIP super-blocks with 64 weights
Q3_K scalar and AVX2 works.
* k_quants: WIP super-blocks with 64 weights
Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar
* k_quants: WIP super-blocks with 64 weights
Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,
* k_quants: WIP super-blocks with 64 weights
Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q3_K working on CUDA.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on CUDA, and with this CUDA is done.
* k_quants: WIP super-blocks with 64 weights
Q6_K working on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Q4_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q2_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.
With that, we have full support for ARM_NEON, although
performance is not quite there.
* k_quants: WIP super-blocks with 64 weights
Slightly more efficient Q3_K and Q5_K
* k_quants: WIP super-blocks with 64 weights
Another small improvement for Q3_K and Q5_K on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.
* k_quants: WIP super-blocks with 64 weights
* We are able to pass preprocessor macros to the Metal
compiler
* Q6_K works and is actually slightly more efficient than
the QK_K = 256 version (25.2 ms vs 25.8 ms)
* k_quants: WIP super-blocks with 64 weights
Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).
* k_quants: WIP super-blocks with 64 weights
Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).
* k_quants: WIP super-blocks with 64 weights
Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).
* k_quants: WIP super-blocks with 64 weights
Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).
* k_quants: call them _K, not _k, also on Metal
* k_quants: correctly define QK_K in llama.cpp
* Fixed bug in q4_K quantization added with the 64-block addition
* Simplify via lambda
* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64
Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.
* k_quants: switch Q4_K to 4-bit scales when QK_K = 64
Here the loss in accuracy is greater than for Q3_K,
but the Q4_K points still move further to the left on
the perplexity vs size curve.
* k_quants: forgot to add the Metal changes in last commit
* k_quants: change Q5_K to be type 0 when QK_K = 64
Still needs AVX2 implementation
* k_quants: AVX2 implementation for new 64-weight Q5_K
* k_quants: 10% faster ARM_NEON Q5_K dot product
* k_quants: fixed issue caused by merging with master
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-06-26 16:43:07 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
#define MUL_MAT_SRC1_COL_STRIDE 128
|
k-quants : support for super-block size of 64 (#2001)
* k_quants: WIP super-blocks with 64 weights
* k_quants: WIP super-blocks with 64 weights
Q6_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q4_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)
* k_quants: WIP super-blocks with 64 weights
Q3_K scalar and AVX2 works.
* k_quants: WIP super-blocks with 64 weights
Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar
* k_quants: WIP super-blocks with 64 weights
Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,
* k_quants: WIP super-blocks with 64 weights
Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q3_K working on CUDA.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on CUDA, and with this CUDA is done.
* k_quants: WIP super-blocks with 64 weights
Q6_K working on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Q4_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q2_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.
With that, we have full support for ARM_NEON, although
performance is not quite there.
* k_quants: WIP super-blocks with 64 weights
Slightly more efficient Q3_K and Q5_K
* k_quants: WIP super-blocks with 64 weights
Another small improvement for Q3_K and Q5_K on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.
* k_quants: WIP super-blocks with 64 weights
* We are able to pass preprocessor macros to the Metal
compiler
* Q6_K works and is actually slightly more efficient than
the QK_K = 256 version (25.2 ms vs 25.8 ms)
* k_quants: WIP super-blocks with 64 weights
Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).
* k_quants: WIP super-blocks with 64 weights
Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).
* k_quants: WIP super-blocks with 64 weights
Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).
* k_quants: WIP super-blocks with 64 weights
Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).
* k_quants: call them _K, not _k, also on Metal
* k_quants: correctly define QK_K in llama.cpp
* Fixed bug in q4_K quantization added with the 64-block addition
* Simplify via lambda
* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64
Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.
* k_quants: switch Q4_K to 4-bit scales when QK_K = 64
Here the loss in accuracy is greater than for Q3_K,
but the Q4_K points still move further to the left on
the perplexity vs size curve.
* k_quants: forgot to add the Metal changes in last commit
* k_quants: change Q5_K to be type 0 when QK_K = 64
Still needs AVX2 implementation
* k_quants: AVX2 implementation for new 64-weight Q5_K
* k_quants: 10% faster ARM_NEON Q5_K dot product
* k_quants: fixed issue caused by merging with master
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-06-26 16:43:07 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
static __global__ void mul_mat_p021_f16_f32(
|
|
|
|
const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
|
|
|
|
const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y) {
|
k-quants : support for super-block size of 64 (#2001)
* k_quants: WIP super-blocks with 64 weights
* k_quants: WIP super-blocks with 64 weights
Q6_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q4_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)
* k_quants: WIP super-blocks with 64 weights
Q3_K scalar and AVX2 works.
* k_quants: WIP super-blocks with 64 weights
Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar
* k_quants: WIP super-blocks with 64 weights
Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,
* k_quants: WIP super-blocks with 64 weights
Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q3_K working on CUDA.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on CUDA, and with this CUDA is done.
* k_quants: WIP super-blocks with 64 weights
Q6_K working on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Q4_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q2_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.
With that, we have full support for ARM_NEON, although
performance is not quite there.
* k_quants: WIP super-blocks with 64 weights
Slightly more efficient Q3_K and Q5_K
* k_quants: WIP super-blocks with 64 weights
Another small improvement for Q3_K and Q5_K on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.
* k_quants: WIP super-blocks with 64 weights
* We are able to pass preprocessor macros to the Metal
compiler
* Q6_K works and is actually slightly more efficient than
the QK_K = 256 version (25.2 ms vs 25.8 ms)
* k_quants: WIP super-blocks with 64 weights
Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).
* k_quants: WIP super-blocks with 64 weights
Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).
* k_quants: WIP super-blocks with 64 weights
Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).
* k_quants: WIP super-blocks with 64 weights
Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).
* k_quants: call them _K, not _k, also on Metal
* k_quants: correctly define QK_K in llama.cpp
* Fixed bug in q4_K quantization added with the 64-block addition
* Simplify via lambda
* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64
Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.
* k_quants: switch Q4_K to 4-bit scales when QK_K = 64
Here the loss in accuracy is greater than for Q3_K,
but the Q4_K points still move further to the left on
the perplexity vs size curve.
* k_quants: forgot to add the Metal changes in last commit
* k_quants: change Q5_K to be type 0 when QK_K = 64
Still needs AVX2 implementation
* k_quants: AVX2 implementation for new 64-weight Q5_K
* k_quants: 10% faster ARM_NEON Q5_K dot product
* k_quants: fixed issue caused by merging with master
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-06-26 16:43:07 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
const half * x = (const half *) vx;
|
2023-06-16 17:08:44 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
|
|
|
|
const int channel = blockDim.z*blockIdx.z + threadIdx.z;
|
|
|
|
const int channel_x = channel / (nchannels_y / nchannels_x);
|
2023-06-16 17:08:44 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
const int nrows_y = ncols_x;
|
|
|
|
const int nrows_dst = nrows_x;
|
|
|
|
const int row_dst = row_x;
|
2024-03-20 13:42:59 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
float tmp = 0.0f;
|
2023-06-16 17:08:44 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
|
|
|
|
const int col_x = col_x0 + threadIdx.x;
|
2023-06-16 17:08:44 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
if (col_x >= ncols_x) {
|
|
|
|
break;
|
|
|
|
}
|
2023-06-19 15:14:09 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
// x is transposed and permuted
|
|
|
|
const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x;
|
|
|
|
const float xi = __half2float(x[ix]);
|
2023-06-16 17:08:44 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
const int row_y = col_x;
|
2023-06-16 17:08:44 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
// y is not transposed but permuted
|
|
|
|
const int iy = channel*nrows_y + row_y;
|
k-quants : support for super-block size of 64 (#2001)
* k_quants: WIP super-blocks with 64 weights
* k_quants: WIP super-blocks with 64 weights
Q6_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q4_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)
* k_quants: WIP super-blocks with 64 weights
Q3_K scalar and AVX2 works.
* k_quants: WIP super-blocks with 64 weights
Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar
* k_quants: WIP super-blocks with 64 weights
Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,
* k_quants: WIP super-blocks with 64 weights
Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q3_K working on CUDA.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on CUDA, and with this CUDA is done.
* k_quants: WIP super-blocks with 64 weights
Q6_K working on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Q4_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q2_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.
With that, we have full support for ARM_NEON, although
performance is not quite there.
* k_quants: WIP super-blocks with 64 weights
Slightly more efficient Q3_K and Q5_K
* k_quants: WIP super-blocks with 64 weights
Another small improvement for Q3_K and Q5_K on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.
* k_quants: WIP super-blocks with 64 weights
* We are able to pass preprocessor macros to the Metal
compiler
* Q6_K works and is actually slightly more efficient than
the QK_K = 256 version (25.2 ms vs 25.8 ms)
* k_quants: WIP super-blocks with 64 weights
Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).
* k_quants: WIP super-blocks with 64 weights
Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).
* k_quants: WIP super-blocks with 64 weights
Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).
* k_quants: WIP super-blocks with 64 weights
Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).
* k_quants: call them _K, not _k, also on Metal
* k_quants: correctly define QK_K in llama.cpp
* Fixed bug in q4_K quantization added with the 64-block addition
* Simplify via lambda
* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64
Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.
* k_quants: switch Q4_K to 4-bit scales when QK_K = 64
Here the loss in accuracy is greater than for Q3_K,
but the Q4_K points still move further to the left on
the perplexity vs size curve.
* k_quants: forgot to add the Metal changes in last commit
* k_quants: change Q5_K to be type 0 when QK_K = 64
Still needs AVX2 implementation
* k_quants: AVX2 implementation for new 64-weight Q5_K
* k_quants: 10% faster ARM_NEON Q5_K dot product
* k_quants: fixed issue caused by merging with master
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-06-26 16:43:07 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
tmp += xi * y[iy];
|
2024-03-20 13:42:59 +00:00
|
|
|
}
|
k-quants : support for super-block size of 64 (#2001)
* k_quants: WIP super-blocks with 64 weights
* k_quants: WIP super-blocks with 64 weights
Q6_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q4_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)
* k_quants: WIP super-blocks with 64 weights
Q3_K scalar and AVX2 works.
* k_quants: WIP super-blocks with 64 weights
Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar
* k_quants: WIP super-blocks with 64 weights
Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,
* k_quants: WIP super-blocks with 64 weights
Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q3_K working on CUDA.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on CUDA, and with this CUDA is done.
* k_quants: WIP super-blocks with 64 weights
Q6_K working on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Q4_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q2_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.
With that, we have full support for ARM_NEON, although
performance is not quite there.
* k_quants: WIP super-blocks with 64 weights
Slightly more efficient Q3_K and Q5_K
* k_quants: WIP super-blocks with 64 weights
Another small improvement for Q3_K and Q5_K on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.
* k_quants: WIP super-blocks with 64 weights
* We are able to pass preprocessor macros to the Metal
compiler
* Q6_K works and is actually slightly more efficient than
the QK_K = 256 version (25.2 ms vs 25.8 ms)
* k_quants: WIP super-blocks with 64 weights
Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).
* k_quants: WIP super-blocks with 64 weights
Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).
* k_quants: WIP super-blocks with 64 weights
Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).
* k_quants: WIP super-blocks with 64 weights
Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).
* k_quants: call them _K, not _k, also on Metal
* k_quants: correctly define QK_K in llama.cpp
* Fixed bug in q4_K quantization added with the 64-block addition
* Simplify via lambda
* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64
Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.
* k_quants: switch Q4_K to 4-bit scales when QK_K = 64
Here the loss in accuracy is greater than for Q3_K,
but the Q4_K points still move further to the left on
the perplexity vs size curve.
* k_quants: forgot to add the Metal changes in last commit
* k_quants: change Q5_K to be type 0 when QK_K = 64
Still needs AVX2 implementation
* k_quants: AVX2 implementation for new 64-weight Q5_K
* k_quants: 10% faster ARM_NEON Q5_K dot product
* k_quants: fixed issue caused by merging with master
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-06-26 16:43:07 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
// dst is not transposed and not permuted
|
|
|
|
const int idst = channel*nrows_dst + row_dst;
|
2023-06-16 17:08:44 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
// sum up partial sums and write back result
|
|
|
|
tmp = warp_reduce_sum(tmp);
|
2023-06-16 17:08:44 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
if (threadIdx.x == 0) {
|
|
|
|
dst[idst] = tmp;
|
2024-03-20 13:42:59 +00:00
|
|
|
}
|
|
|
|
}
|
2023-06-16 17:08:44 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
|
|
|
|
const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
|
|
|
|
const int row_stride_x, const int channel_stride_x, const int channel_x_divisor) {
|
2023-06-16 17:08:44 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
const half * x = (const half *) vx;
|
2023-06-16 17:08:44 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
|
|
|
|
const int channel = blockDim.z*blockIdx.z + threadIdx.z;
|
|
|
|
const int channel_x = channel / channel_x_divisor;
|
2023-06-16 17:08:44 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
const int nrows_y = ncols_x;
|
|
|
|
const int nrows_dst = nrows_x;
|
|
|
|
const int row_dst = row_x;
|
2023-06-16 17:08:44 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
const int idst = channel*nrows_dst + row_dst;
|
k-quants : support for super-block size of 64 (#2001)
* k_quants: WIP super-blocks with 64 weights
* k_quants: WIP super-blocks with 64 weights
Q6_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q4_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)
* k_quants: WIP super-blocks with 64 weights
Q3_K scalar and AVX2 works.
* k_quants: WIP super-blocks with 64 weights
Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar
* k_quants: WIP super-blocks with 64 weights
Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,
* k_quants: WIP super-blocks with 64 weights
Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q3_K working on CUDA.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on CUDA, and with this CUDA is done.
* k_quants: WIP super-blocks with 64 weights
Q6_K working on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Q4_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q2_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.
With that, we have full support for ARM_NEON, although
performance is not quite there.
* k_quants: WIP super-blocks with 64 weights
Slightly more efficient Q3_K and Q5_K
* k_quants: WIP super-blocks with 64 weights
Another small improvement for Q3_K and Q5_K on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.
* k_quants: WIP super-blocks with 64 weights
* We are able to pass preprocessor macros to the Metal
compiler
* Q6_K works and is actually slightly more efficient than
the QK_K = 256 version (25.2 ms vs 25.8 ms)
* k_quants: WIP super-blocks with 64 weights
Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).
* k_quants: WIP super-blocks with 64 weights
Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).
* k_quants: WIP super-blocks with 64 weights
Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).
* k_quants: WIP super-blocks with 64 weights
Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).
* k_quants: call them _K, not _k, also on Metal
* k_quants: correctly define QK_K in llama.cpp
* Fixed bug in q4_K quantization added with the 64-block addition
* Simplify via lambda
* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64
Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.
* k_quants: switch Q4_K to 4-bit scales when QK_K = 64
Here the loss in accuracy is greater than for Q3_K,
but the Q4_K points still move further to the left on
the perplexity vs size curve.
* k_quants: forgot to add the Metal changes in last commit
* k_quants: change Q5_K to be type 0 when QK_K = 64
Still needs AVX2 implementation
* k_quants: AVX2 implementation for new 64-weight Q5_K
* k_quants: 10% faster ARM_NEON Q5_K dot product
* k_quants: fixed issue caused by merging with master
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-06-26 16:43:07 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
float tmp = 0.0f;
|
k-quants : support for super-block size of 64 (#2001)
* k_quants: WIP super-blocks with 64 weights
* k_quants: WIP super-blocks with 64 weights
Q6_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q4_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)
* k_quants: WIP super-blocks with 64 weights
Q3_K scalar and AVX2 works.
* k_quants: WIP super-blocks with 64 weights
Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar
* k_quants: WIP super-blocks with 64 weights
Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,
* k_quants: WIP super-blocks with 64 weights
Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q3_K working on CUDA.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on CUDA, and with this CUDA is done.
* k_quants: WIP super-blocks with 64 weights
Q6_K working on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Q4_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q2_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.
With that, we have full support for ARM_NEON, although
performance is not quite there.
* k_quants: WIP super-blocks with 64 weights
Slightly more efficient Q3_K and Q5_K
* k_quants: WIP super-blocks with 64 weights
Another small improvement for Q3_K and Q5_K on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.
* k_quants: WIP super-blocks with 64 weights
* We are able to pass preprocessor macros to the Metal
compiler
* Q6_K works and is actually slightly more efficient than
the QK_K = 256 version (25.2 ms vs 25.8 ms)
* k_quants: WIP super-blocks with 64 weights
Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).
* k_quants: WIP super-blocks with 64 weights
Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).
* k_quants: WIP super-blocks with 64 weights
Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).
* k_quants: WIP super-blocks with 64 weights
Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).
* k_quants: call them _K, not _k, also on Metal
* k_quants: correctly define QK_K in llama.cpp
* Fixed bug in q4_K quantization added with the 64-block addition
* Simplify via lambda
* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64
Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.
* k_quants: switch Q4_K to 4-bit scales when QK_K = 64
Here the loss in accuracy is greater than for Q3_K,
but the Q4_K points still move further to the left on
the perplexity vs size curve.
* k_quants: forgot to add the Metal changes in last commit
* k_quants: change Q5_K to be type 0 when QK_K = 64
Still needs AVX2 implementation
* k_quants: AVX2 implementation for new 64-weight Q5_K
* k_quants: 10% faster ARM_NEON Q5_K dot product
* k_quants: fixed issue caused by merging with master
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-06-26 16:43:07 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
|
|
|
|
const int col_x = col_x0 + threadIdx.x;
|
2023-06-16 17:08:44 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
if (col_x >= ncols_x) {
|
|
|
|
break;
|
|
|
|
}
|
2023-06-16 17:08:44 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
const int row_y = col_x;
|
k-quants : support for super-block size of 64 (#2001)
* k_quants: WIP super-blocks with 64 weights
* k_quants: WIP super-blocks with 64 weights
Q6_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q4_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)
* k_quants: WIP super-blocks with 64 weights
Q3_K scalar and AVX2 works.
* k_quants: WIP super-blocks with 64 weights
Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar
* k_quants: WIP super-blocks with 64 weights
Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,
* k_quants: WIP super-blocks with 64 weights
Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q3_K working on CUDA.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on CUDA, and with this CUDA is done.
* k_quants: WIP super-blocks with 64 weights
Q6_K working on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Q4_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q2_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.
With that, we have full support for ARM_NEON, although
performance is not quite there.
* k_quants: WIP super-blocks with 64 weights
Slightly more efficient Q3_K and Q5_K
* k_quants: WIP super-blocks with 64 weights
Another small improvement for Q3_K and Q5_K on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.
* k_quants: WIP super-blocks with 64 weights
* We are able to pass preprocessor macros to the Metal
compiler
* Q6_K works and is actually slightly more efficient than
the QK_K = 256 version (25.2 ms vs 25.8 ms)
* k_quants: WIP super-blocks with 64 weights
Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).
* k_quants: WIP super-blocks with 64 weights
Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).
* k_quants: WIP super-blocks with 64 weights
Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).
* k_quants: WIP super-blocks with 64 weights
Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).
* k_quants: call them _K, not _k, also on Metal
* k_quants: correctly define QK_K in llama.cpp
* Fixed bug in q4_K quantization added with the 64-block addition
* Simplify via lambda
* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64
Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.
* k_quants: switch Q4_K to 4-bit scales when QK_K = 64
Here the loss in accuracy is greater than for Q3_K,
but the Q4_K points still move further to the left on
the perplexity vs size curve.
* k_quants: forgot to add the Metal changes in last commit
* k_quants: change Q5_K to be type 0 when QK_K = 64
Still needs AVX2 implementation
* k_quants: AVX2 implementation for new 64-weight Q5_K
* k_quants: 10% faster ARM_NEON Q5_K dot product
* k_quants: fixed issue caused by merging with master
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-06-26 16:43:07 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
|
|
|
|
const int iy = channel*nrows_y + row_y;
|
k-quants : support for super-block size of 64 (#2001)
* k_quants: WIP super-blocks with 64 weights
* k_quants: WIP super-blocks with 64 weights
Q6_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q4_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)
* k_quants: WIP super-blocks with 64 weights
Q3_K scalar and AVX2 works.
* k_quants: WIP super-blocks with 64 weights
Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar
* k_quants: WIP super-blocks with 64 weights
Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,
* k_quants: WIP super-blocks with 64 weights
Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q3_K working on CUDA.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on CUDA, and with this CUDA is done.
* k_quants: WIP super-blocks with 64 weights
Q6_K working on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Q4_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q2_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.
With that, we have full support for ARM_NEON, although
performance is not quite there.
* k_quants: WIP super-blocks with 64 weights
Slightly more efficient Q3_K and Q5_K
* k_quants: WIP super-blocks with 64 weights
Another small improvement for Q3_K and Q5_K on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.
* k_quants: WIP super-blocks with 64 weights
* We are able to pass preprocessor macros to the Metal
compiler
* Q6_K works and is actually slightly more efficient than
the QK_K = 256 version (25.2 ms vs 25.8 ms)
* k_quants: WIP super-blocks with 64 weights
Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).
* k_quants: WIP super-blocks with 64 weights
Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).
* k_quants: WIP super-blocks with 64 weights
Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).
* k_quants: WIP super-blocks with 64 weights
Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).
* k_quants: call them _K, not _k, also on Metal
* k_quants: correctly define QK_K in llama.cpp
* Fixed bug in q4_K quantization added with the 64-block addition
* Simplify via lambda
* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64
Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.
* k_quants: switch Q4_K to 4-bit scales when QK_K = 64
Here the loss in accuracy is greater than for Q3_K,
but the Q4_K points still move further to the left on
the perplexity vs size curve.
* k_quants: forgot to add the Metal changes in last commit
* k_quants: change Q5_K to be type 0 when QK_K = 64
Still needs AVX2 implementation
* k_quants: AVX2 implementation for new 64-weight Q5_K
* k_quants: 10% faster ARM_NEON Q5_K dot product
* k_quants: fixed issue caused by merging with master
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-06-26 16:43:07 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
const float xi = __half2float(x[ix]);
|
2023-06-16 17:08:44 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
tmp += xi * y[iy];
|
2024-03-20 13:42:59 +00:00
|
|
|
}
|
2023-06-19 15:14:09 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
// sum up partial sums and write back result
|
|
|
|
tmp = warp_reduce_sum(tmp);
|
2023-06-16 17:08:44 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
if (threadIdx.x == 0) {
|
|
|
|
dst[idst] = tmp;
|
2024-03-20 13:42:59 +00:00
|
|
|
}
|
|
|
|
}
|
2023-06-16 17:08:44 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
static void ggml_mul_mat_p021_f16_f32_cuda(
|
|
|
|
const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
|
|
|
|
const int nchannels_x, const int nchannels_y, cudaStream_t stream) {
|
2023-06-16 17:08:44 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
const dim3 block_nums(1, nrows_x, nchannels_y);
|
|
|
|
const dim3 block_dims(WARP_SIZE, 1, 1);
|
|
|
|
mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x, nchannels_y);
|
2024-03-20 13:42:59 +00:00
|
|
|
}
|
2023-06-16 17:08:44 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
static void ggml_mul_mat_vec_nc_f16_f32_cuda(
|
|
|
|
const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x,
|
|
|
|
const int nchannels_x, const int nchannels_y, const int channel_stride_x, cudaStream_t stream) {
|
2023-07-23 05:49:20 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
const dim3 block_nums(1, nrows_x, nchannels_y);
|
|
|
|
const dim3 block_dims(WARP_SIZE, 1, 1);
|
|
|
|
mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
|
|
|
|
(vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x);
|
2023-06-14 17:47:19 +00:00
|
|
|
}
|
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
static cudaError_t ggml_cuda_cpy_tensor_2d(
|
|
|
|
void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {
|
2023-06-14 17:47:19 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
GGML_ASSERT(ggml_backend_buffer_is_cuda(src->buffer));
|
|
|
|
char * src_ptr = (char *) src->data;
|
|
|
|
char * dst_ptr = (char *) dst;
|
2023-06-14 17:47:19 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
const int64_t ne0 = src->ne[0];
|
|
|
|
const int64_t nb0 = src->nb[0];
|
|
|
|
const int64_t nb1 = src->nb[1];
|
|
|
|
const int64_t nb2 = src->nb[2];
|
|
|
|
const int64_t nb3 = src->nb[3];
|
|
|
|
const enum ggml_type type = src->type;
|
|
|
|
const int64_t ts = ggml_type_size(type);
|
|
|
|
const int64_t bs = ggml_blck_size(type);
|
|
|
|
int64_t i1_diff = i1_high - i1_low;
|
|
|
|
|
|
|
|
const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
|
|
|
|
if (nb0 == ts && nb1 == ts*ne0/bs) {
|
|
|
|
return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, cudaMemcpyDeviceToDevice, stream);
|
|
|
|
} else if (nb0 == ts) {
|
|
|
|
return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, cudaMemcpyDeviceToDevice, stream);
|
|
|
|
} else {
|
|
|
|
for (int64_t i1 = 0; i1 < i1_diff; i1++) {
|
|
|
|
const void * rx = (const void *) ((const char *) x + i1*nb1);
|
|
|
|
void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
|
|
|
|
// pretend the row is a matrix with cols=1
|
|
|
|
cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, cudaMemcpyDeviceToDevice, stream);
|
|
|
|
if (r != cudaSuccess) {
|
|
|
|
return r;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return cudaSuccess;
|
|
|
|
}
|
2024-03-20 13:42:59 +00:00
|
|
|
}
|
2023-06-14 17:47:19 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
static void ggml_cuda_op_mul_mat_cublas(
|
|
|
|
ggml_backend_cuda_context & ctx,
|
|
|
|
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
|
|
|
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
|
|
|
const int64_t src1_padded_row_size, cudaStream_t stream) {
|
2023-06-14 17:47:19 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_ASSERT(src0_dd_i != nullptr);
|
|
|
|
GGML_ASSERT(src1_ddf_i != nullptr);
|
|
|
|
GGML_ASSERT(dst_dd_i != nullptr);
|
2023-06-14 17:47:19 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
const int64_t ne00 = src0->ne[0];
|
|
|
|
const int64_t ne10 = src1->ne[0];
|
2023-11-01 22:10:09 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
const int64_t ne0 = dst->ne[0];
|
2023-11-01 22:10:09 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
const int64_t row_diff = row_high - row_low;
|
2023-11-01 22:10:09 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
int id = ggml_cuda_get_device();
|
2023-11-01 22:10:09 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
// the main device has a larger memory buffer to hold the results from all GPUs
|
|
|
|
// ldc == nrows of the matrix that cuBLAS writes into
|
2024-04-09 08:16:13 +00:00
|
|
|
int64_t ldc = id == ctx.device ? ne0 : row_diff;
|
2023-10-27 14:01:23 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
const int compute_capability = ggml_cuda_info().devices[id].cc;
|
2023-10-24 13:48:37 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
|
|
|
|
// convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
|
2024-04-18 13:18:48 +00:00
|
|
|
ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
|
2024-03-20 13:42:59 +00:00
|
|
|
if (src0->type != GGML_TYPE_F16) {
|
|
|
|
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
|
|
|
|
GGML_ASSERT(to_fp16_cuda != nullptr);
|
|
|
|
size_t ne = row_diff*ne00;
|
|
|
|
src0_as_f16.alloc(ne);
|
|
|
|
to_fp16_cuda(src0_dd_i, src0_as_f16.get(), ne, stream);
|
|
|
|
}
|
|
|
|
const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16.get();
|
|
|
|
|
2024-04-18 13:18:48 +00:00
|
|
|
ggml_cuda_pool_alloc<half> src1_as_f16(ctx.pool(id));
|
2024-03-20 13:42:59 +00:00
|
|
|
if (src1->type != GGML_TYPE_F16) {
|
|
|
|
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
|
|
|
|
GGML_ASSERT(to_fp16_cuda != nullptr);
|
|
|
|
size_t ne = src1_ncols*ne10;
|
|
|
|
src1_as_f16.alloc(ne);
|
|
|
|
to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
|
|
|
|
}
|
|
|
|
const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16.get();
|
2024-04-18 13:18:48 +00:00
|
|
|
ggml_cuda_pool_alloc<half> dst_f16(ctx.pool(id), row_diff*src1_ncols);
|
2023-10-24 13:48:37 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
const half alpha_f16 = 1.0f;
|
|
|
|
const half beta_f16 = 0.0f;
|
2023-10-24 13:48:37 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
|
|
|
|
CUBLAS_CHECK(
|
|
|
|
cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
|
|
|
|
row_diff, src1_ncols, ne10,
|
|
|
|
&alpha_f16, src0_ptr, CUDA_R_16F, ne00,
|
|
|
|
src1_ptr, CUDA_R_16F, ne10,
|
|
|
|
&beta_f16, dst_f16.get(), CUDA_R_16F, ldc,
|
|
|
|
CUBLAS_COMPUTE_16F,
|
|
|
|
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
2023-10-24 13:48:37 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
|
|
|
|
to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
|
|
|
|
} else {
|
|
|
|
ggml_cuda_pool_alloc<float> src0_ddq_as_f32(ctx.pool(id));
|
|
|
|
ggml_cuda_pool_alloc<float> src1_ddq_as_f32(ctx.pool(id));
|
2023-10-24 13:48:37 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
if (src0->type != GGML_TYPE_F32) {
|
|
|
|
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
|
|
|
|
GGML_ASSERT(to_fp32_cuda != nullptr);
|
|
|
|
src0_ddq_as_f32.alloc(row_diff*ne00);
|
|
|
|
to_fp32_cuda(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
|
|
|
|
}
|
|
|
|
if (src1->type != GGML_TYPE_F32) {
|
|
|
|
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src1->type);
|
|
|
|
GGML_ASSERT(to_fp32_cuda != nullptr);
|
|
|
|
src1_ddq_as_f32.alloc(src1_ncols*ne10);
|
|
|
|
to_fp32_cuda(src1_ddf_i, src1_ddq_as_f32.get(), src1_ncols*ne10, stream);
|
|
|
|
}
|
2023-10-24 13:48:37 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
|
|
|
|
const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();
|
2023-10-24 13:48:37 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
const float alpha = 1.0f;
|
|
|
|
const float beta = 0.0f;
|
2023-10-24 13:48:37 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
|
|
|
|
CUBLAS_CHECK(
|
|
|
|
cublasSgemm(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
|
|
|
|
row_diff, src1_ncols, ne10,
|
|
|
|
&alpha, src0_ddf_i, ne00,
|
|
|
|
src1_ddf1_i, ne10,
|
|
|
|
&beta, dst_dd_i, ldc));
|
2023-12-29 08:32:31 +00:00
|
|
|
}
|
2023-12-18 17:27:47 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_UNUSED(dst);
|
|
|
|
GGML_UNUSED(src1_ddq_i);
|
|
|
|
GGML_UNUSED(src1_padded_row_size);
|
|
|
|
}
|
2023-12-18 17:27:47 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
|
|
|
|
static bool peer_access_enabled = false;
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
|
|
|
|
|
|
|
|
if (peer_access_enabled == enable_peer_access) {
|
|
|
|
return;
|
2023-12-07 20:26:54 +00:00
|
|
|
}
|
2024-03-20 13:42:59 +00:00
|
|
|
|
|
|
|
#ifdef NDEBUG
|
|
|
|
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
|
|
|
ggml_cuda_set_device(id);
|
|
|
|
CUDA_CHECK(cudaDeviceSynchronize());
|
2023-12-07 20:26:54 +00:00
|
|
|
}
|
2024-03-20 13:42:59 +00:00
|
|
|
|
|
|
|
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
|
|
|
ggml_cuda_set_device(id);
|
|
|
|
|
|
|
|
for (int id_other = 0; id_other < ggml_backend_cuda_get_device_count(); ++id_other) {
|
|
|
|
if (id == id_other) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (id != main_device && id_other != main_device) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
int can_access_peer;
|
|
|
|
CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
|
|
|
|
if (can_access_peer) {
|
|
|
|
if (enable_peer_access) {
|
|
|
|
cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0);
|
|
|
|
if (err != cudaErrorPeerAccessAlreadyEnabled) {
|
|
|
|
CUDA_CHECK(err);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
|
|
|
|
if (err != cudaErrorPeerAccessNotEnabled) {
|
|
|
|
CUDA_CHECK(err);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2023-12-07 20:26:54 +00:00
|
|
|
}
|
2024-03-25 12:50:23 +00:00
|
|
|
|
|
|
|
ggml_cuda_set_device(main_device);
|
2024-03-20 13:42:59 +00:00
|
|
|
#endif // NDEBUG
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
peer_access_enabled = enable_peer_access;
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_UNUSED(main_device);
|
2023-12-07 20:26:54 +00:00
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
static void ggml_cuda_op_mul_mat(
|
|
|
|
ggml_backend_cuda_context & ctx,
|
|
|
|
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
|
|
|
|
const bool convert_src1_to_q8_1) {
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
const int64_t ne00 = src0->ne[0];
|
|
|
|
const int64_t ne01 = src0->ne[1];
|
|
|
|
const int64_t ne02 = src0->ne[2];
|
|
|
|
const int64_t ne03 = src0->ne[3];
|
2023-12-20 14:41:22 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
const int64_t ne10 = src1->ne[0];
|
|
|
|
const int64_t ne11 = src1->ne[1];
|
|
|
|
const int64_t ne12 = src1->ne[2];
|
|
|
|
const int64_t ne13 = src1->ne[3];
|
|
|
|
const int64_t nrows1 = ggml_nrows(src1);
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_ASSERT(ne03 == ne13);
|
|
|
|
|
|
|
|
const int64_t ne0 = dst->ne[0];
|
|
|
|
const int64_t ne1 = dst->ne[1];
|
2023-12-13 12:04:25 +00:00
|
|
|
|
2024-04-09 08:16:13 +00:00
|
|
|
const int64_t nb2 = dst->nb[2];
|
|
|
|
const int64_t nb3 = dst->nb[3];
|
2023-12-13 12:04:25 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_ASSERT(ggml_backend_buffer_is_cuda(dst->buffer));
|
|
|
|
GGML_ASSERT(ggml_backend_buffer_is_cuda(src1->buffer));
|
|
|
|
ggml_backend_cuda_buffer_context * src1_ctx = (ggml_backend_cuda_buffer_context *) src1->buffer->context;
|
|
|
|
ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *) dst->buffer->context;
|
2023-12-13 12:04:25 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));
|
2023-12-13 12:04:25 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
|
2023-12-21 17:42:59 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
const int64_t i02_divisor = ne12 / ne02;
|
2023-12-13 12:04:25 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
const size_t src0_ts = ggml_type_size(src0->type);
|
|
|
|
const size_t src0_bs = ggml_blck_size(src0->type);
|
|
|
|
const size_t q8_1_ts = sizeof(block_q8_1);
|
|
|
|
const size_t q8_1_bs = QK8_1;
|
2023-12-13 12:04:25 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
const bool src0_is_contiguous = ggml_is_contiguous(src0);
|
|
|
|
const bool src1_is_contiguous = ggml_is_contiguous(src1);
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);
|
|
|
|
GGML_ASSERT(!(split && ne02 > 1));
|
|
|
|
GGML_ASSERT(!(split && ne03 > 1));
|
|
|
|
GGML_ASSERT(!(split && ne02 < ne12));
|
2023-12-13 12:04:25 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
ggml_tensor_extra_gpu * src0_extra = split ? (ggml_tensor_extra_gpu *) src0->extra : nullptr;
|
2023-12-13 12:04:25 +00:00
|
|
|
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
|
|
|
|
if (split) {
|
|
|
|
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
|
|
|
|
tensor_split = buft_ctx->tensor_split;
|
|
|
|
}
|
2023-12-13 12:04:25 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
struct dev_data {
|
|
|
|
ggml_cuda_pool_alloc<char> src0_dd_alloc;
|
|
|
|
ggml_cuda_pool_alloc<float> src1_ddf_alloc;
|
|
|
|
ggml_cuda_pool_alloc<char> src1_ddq_alloc;
|
|
|
|
ggml_cuda_pool_alloc<float> dst_dd_alloc;
|
2023-12-20 14:41:22 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
char * src0_dd = nullptr;
|
|
|
|
float * src1_ddf = nullptr; // float
|
|
|
|
char * src1_ddq = nullptr; // q8_1
|
|
|
|
float * dst_dd = nullptr;
|
2023-12-20 14:41:22 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
int64_t row_low;
|
|
|
|
int64_t row_high;
|
|
|
|
};
|
2023-12-20 14:41:22 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
dev_data dev[GGML_CUDA_MAX_DEVICES];
|
2023-12-13 12:04:25 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
int used_devices = 0;
|
2023-12-13 12:04:25 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
|
|
|
// by default, use all rows
|
|
|
|
dev[id].row_low = 0;
|
|
|
|
dev[id].row_high = ne01;
|
|
|
|
|
|
|
|
// for multi GPU, get the row boundaries from tensor split
|
|
|
|
// and round to mul_mat_q tile sizes
|
|
|
|
if (split) {
|
|
|
|
const int64_t rounding = get_row_rounding(src0->type, tensor_split);
|
|
|
|
|
|
|
|
if (id != 0) {
|
|
|
|
dev[id].row_low = ne01*tensor_split[id];
|
|
|
|
if (dev[id].row_low < ne01) {
|
|
|
|
dev[id].row_low -= dev[id].row_low % rounding;
|
|
|
|
}
|
2023-12-20 14:41:22 +00:00
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
if (id != ggml_backend_cuda_get_device_count() - 1) {
|
|
|
|
dev[id].row_high = ne01*tensor_split[id + 1];
|
|
|
|
if (dev[id].row_high < ne01) {
|
|
|
|
dev[id].row_high -= dev[id].row_high % rounding;
|
|
|
|
}
|
2023-12-20 14:41:22 +00:00
|
|
|
}
|
2024-03-20 13:42:59 +00:00
|
|
|
}
|
|
|
|
}
|
2023-12-20 14:41:22 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
|
|
|
if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) {
|
|
|
|
continue;
|
|
|
|
}
|
2023-12-20 14:41:22 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
used_devices++;
|
2023-12-20 14:41:22 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
const bool src1_on_device = id == src1_ctx->device;
|
|
|
|
const bool dst_on_device = id == dst_ctx->device;
|
2023-12-20 14:41:22 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
ggml_cuda_set_device(id);
|
|
|
|
cudaStream_t stream = ctx.stream(id, 0);
|
2023-12-20 14:41:22 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
if (src0_is_contiguous) {
|
|
|
|
dev[id].src0_dd = split ? (char *) src0_extra->data_device[id] : (char *) src0->data;
|
|
|
|
} else {
|
|
|
|
dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), ggml_nbytes(src0));
|
|
|
|
}
|
2023-12-20 14:41:22 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
if (src1_on_device && src1_is_contiguous) {
|
|
|
|
dev[id].src1_ddf = (float *) src1->data;
|
|
|
|
} else {
|
|
|
|
dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1));
|
|
|
|
}
|
2023-12-20 14:41:22 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
if (convert_src1_to_q8_1) {
|
|
|
|
dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
|
2023-12-20 14:41:22 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
if (src1_on_device && src1_is_contiguous) {
|
|
|
|
quantize_row_q8_1_cuda(dev[id].src1_ddf, dev[id].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
|
|
|
|
CUDA_CHECK(cudaGetLastError());
|
2023-12-20 14:41:22 +00:00
|
|
|
}
|
|
|
|
}
|
2024-03-20 13:42:59 +00:00
|
|
|
|
|
|
|
if (dst_on_device) {
|
|
|
|
dev[id].dst_dd = (float *) dst->data;
|
|
|
|
} else {
|
|
|
|
const size_t size_dst_ddf = split ? (dev[id].row_high - dev[id].row_low)*ne1 : ggml_nelements(dst);
|
|
|
|
dev[id].dst_dd = dev[id].dst_dd_alloc.alloc(ctx.pool(id), size_dst_ddf);
|
|
|
|
}
|
2023-12-13 12:04:25 +00:00
|
|
|
}
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
// if multiple devices are used they need to wait for the main device
|
|
|
|
// here an event is recorded that signals that the main device has finished calculating the input data
|
|
|
|
if (split && used_devices > 1) {
|
|
|
|
ggml_cuda_set_device(ctx.device);
|
|
|
|
CUDA_CHECK(cudaEventRecord(src0_extra->events[ctx.device][0], ctx.stream()));
|
|
|
|
}
|
2023-06-14 17:47:19 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
|
|
|
|
for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
|
|
|
|
const int64_t is = split ? (src1_col_0/src1_col_stride) % GGML_CUDA_MAX_STREAMS : 0;
|
|
|
|
const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
|
2023-10-10 07:50:23 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
|
|
|
if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) {
|
|
|
|
continue;
|
|
|
|
}
|
2023-06-14 17:47:19 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
const bool src1_on_device = id == src1_ctx->device;
|
|
|
|
const bool dst_on_device = id == dst_ctx->device;
|
|
|
|
const int64_t row_diff = dev[id].row_high - dev[id].row_low;
|
2023-06-14 17:47:19 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
ggml_cuda_set_device(id);
|
|
|
|
cudaStream_t stream = ctx.stream(id, is);
|
2023-06-14 17:47:19 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
// wait for main GPU data if necessary
|
|
|
|
if (split && (id != ctx.device || is != 0)) {
|
|
|
|
CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[ctx.device][0], 0));
|
|
|
|
}
|
2024-01-30 14:21:57 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
|
|
|
|
const int64_t i03 = i0 / ne12;
|
|
|
|
const int64_t i02 = i0 % ne12;
|
2023-06-14 17:47:19 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
|
|
|
|
|
|
|
|
// for split tensors the data begins at i0 == i0_offset_low
|
|
|
|
char * src0_dd_i = dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
|
|
|
|
float * src1_ddf_i = dev[id].src1_ddf + (i0*ne11 + src1_col_0) * ne10;
|
|
|
|
char * src1_ddq_i = dev[id].src1_ddq + src1_ddq_i_offset;
|
|
|
|
float * dst_dd_i = dev[id].dst_dd + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
|
2023-06-14 17:47:19 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
// the main device memory buffer can be on VRAM scratch, with space for all partial results
|
|
|
|
// in that case an offset on dst_ddf_i is needed
|
|
|
|
if (id == ctx.device) {
|
|
|
|
dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split
|
|
|
|
}
|
2024-01-29 12:37:33 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
// copy src0, src1 to device if necessary
|
|
|
|
if (src1_is_contiguous) {
|
|
|
|
if (id != ctx.device) {
|
|
|
|
if (convert_src1_to_q8_1) {
|
|
|
|
char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset;
|
|
|
|
CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddq_i, id, src1_ddq_i_source, ctx.device,
|
|
|
|
src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
|
|
|
|
} else {
|
|
|
|
float * src1_ddf_i_source = (float *) src1->data;
|
|
|
|
src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
|
|
|
|
CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddf_i, id, src1_ddf_i_source, ctx.device,
|
|
|
|
src1_ncols*ne10*sizeof(float), stream));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else if (src1_on_device && !src1_is_contiguous) {
|
|
|
|
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
|
|
|
|
src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
|
|
|
|
} else {
|
|
|
|
GGML_ASSERT(false);
|
|
|
|
}
|
2023-06-14 17:47:19 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
if (convert_src1_to_q8_1 && !src1_is_contiguous) {
|
|
|
|
quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
|
|
|
|
CUDA_CHECK(cudaGetLastError());
|
|
|
|
}
|
2023-06-14 17:47:19 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
if (src1_col_0 == 0 && !src0_is_contiguous && i02 % i02_divisor == 0) {
|
|
|
|
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, dev[id].row_low, dev[id].row_high, stream));
|
|
|
|
}
|
2023-06-14 17:47:19 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
// do the computation
|
|
|
|
op(ctx, src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
|
|
|
|
dev[id].row_low, dev[id].row_high, src1_ncols, src1_padded_col_size, stream);
|
|
|
|
CUDA_CHECK(cudaGetLastError());
|
2023-06-14 17:47:19 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
// copy dst to host or other device if necessary
|
|
|
|
if (!dst_on_device) {
|
|
|
|
void * dst_off_device = dst->data;
|
|
|
|
if (split) {
|
|
|
|
// src0 = weight matrix is saved as a transposed matrix for better memory layout.
|
|
|
|
// dst is NOT transposed.
|
|
|
|
// The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
|
|
|
|
// Instead they need to be copied to the correct slice in ne0 = dst row index.
|
|
|
|
// If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
|
|
|
|
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
|
|
|
|
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
|
|
|
|
dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;
|
|
|
|
#if !defined(GGML_USE_HIPBLAS)
|
|
|
|
// cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
|
|
|
|
cudaMemcpy3DPeerParms p = {};
|
|
|
|
p.dstDevice = ctx.device;
|
|
|
|
p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), row_diff, src1_ncols);
|
|
|
|
p.srcDevice = id;
|
|
|
|
p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols);
|
|
|
|
p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1);
|
|
|
|
CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream));
|
|
|
|
#else
|
|
|
|
// HIP does not support cudaMemcpy3DPeerAsync or vmm pools
|
|
|
|
CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float),
|
|
|
|
dst_dd_i, row_diff*sizeof(float),
|
|
|
|
row_diff*sizeof(float), src1_ncols,
|
|
|
|
cudaMemcpyDeviceToDevice, stream));
|
|
|
|
#endif
|
|
|
|
} else {
|
|
|
|
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
|
|
|
|
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
|
|
|
|
dhf_dst_i += src1_col_0*ne0;
|
|
|
|
CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), cudaMemcpyDeviceToDevice, stream));
|
|
|
|
}
|
|
|
|
}
|
2023-06-14 17:47:19 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
// add event for the main device to wait on until other device is done
|
|
|
|
if (split && (id != ctx.device || is != 0)) {
|
|
|
|
CUDA_CHECK(cudaEventRecord(src0_extra->events[id][is], stream));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2023-06-14 17:47:19 +00:00
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
// main device waits for all other devices to be finished
|
|
|
|
if (split && ggml_backend_cuda_get_device_count() > 1) {
|
|
|
|
int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
|
|
|
|
is_max = is_max <= GGML_CUDA_MAX_STREAMS ? is_max : GGML_CUDA_MAX_STREAMS;
|
2023-06-14 17:47:19 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
ggml_cuda_set_device(ctx.device);
|
|
|
|
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
|
|
|
if (dev[id].row_low == dev[id].row_high) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
for (int64_t is = 0; is < is_max; ++is) {
|
|
|
|
CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), src0_extra->events[id][is], 0));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2023-07-17 17:39:29 +00:00
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
|
|
|
|
GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
|
|
|
|
GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
|
|
|
|
GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
|
|
|
|
GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
|
|
|
|
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
|
|
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
2024-01-12 19:07:38 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
const int64_t ne00 = src0->ne[0];
|
|
|
|
const int64_t ne01 = src0->ne[1];
|
|
|
|
const int64_t ne02 = src0->ne[2];
|
2023-10-08 17:19:14 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
const int64_t ne12 = src1->ne[2];
|
2023-10-08 17:19:14 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
cudaStream_t main_stream = ctx.stream();
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
void * src0_ddq = src0->data;
|
|
|
|
float * src1_ddf = (float *) src1->data;
|
|
|
|
float * dst_ddf = (float *) dst->data;
|
2023-10-08 17:19:14 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
|
|
|
|
}
|
2023-10-08 17:19:14 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
|
|
|
|
GGML_ASSERT(!ggml_is_transposed(src0));
|
|
|
|
GGML_ASSERT(!ggml_is_transposed(src1));
|
|
|
|
GGML_ASSERT(!ggml_is_permuted(src0));
|
|
|
|
GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
|
|
|
|
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
|
|
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
2023-10-08 17:19:14 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
const int64_t ne00 = src0->ne[0];
|
|
|
|
const int64_t ne01 = src0->ne[1];
|
|
|
|
const int64_t ne02 = src0->ne[2];
|
2023-10-08 17:19:14 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
const int64_t nb01 = src0->nb[1];
|
|
|
|
const int64_t nb02 = src0->nb[2];
|
2024-01-12 19:07:38 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
const int64_t ne12 = src1->ne[2];
|
2024-01-12 19:07:38 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
cudaStream_t main_stream = ctx.stream();
|
2023-10-08 17:19:14 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
void * src0_ddq = src0->data;
|
|
|
|
float * src1_ddf = (float *) src1->data;
|
|
|
|
float * dst_ddf = (float *) dst->data;
|
|
|
|
|
|
|
|
const int64_t row_stride_x = nb01 / sizeof(half);
|
|
|
|
const int64_t channel_stride_x = nb02 / sizeof(half);
|
|
|
|
|
|
|
|
ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
|
2023-10-08 17:19:14 +00:00
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
static __global__ void k_compute_batched_ptrs(
|
|
|
|
const half * src0_as_f16, const half * src1_as_f16, char * dst,
|
|
|
|
const void ** ptrs_src, void ** ptrs_dst,
|
|
|
|
int64_t ne12, int64_t ne13,
|
|
|
|
int64_t ne23,
|
|
|
|
size_t nb02, size_t nb03,
|
|
|
|
size_t nb12, size_t nb13,
|
|
|
|
size_t nbd2, size_t nbd3,
|
|
|
|
int64_t r2, int64_t r3) {
|
|
|
|
int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
|
int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
|
2023-10-08 17:19:14 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
if (i13 >= ne13 || i12 >= ne12) {
|
2023-10-08 17:19:14 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
int64_t i03 = i13 / r3;
|
|
|
|
int64_t i02 = i12 / r2;
|
2023-10-08 17:19:14 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
|
|
|
|
ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
|
|
|
|
ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
|
|
|
|
}
|
2023-10-08 17:19:14 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
|
|
GGML_ASSERT(!ggml_is_transposed(src0));
|
|
|
|
GGML_ASSERT(!ggml_is_transposed(src1));
|
2023-10-08 17:19:14 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
|
|
|
|
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
2023-10-08 17:19:14 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_TENSOR_BINARY_OP_LOCALS
|
|
|
|
|
|
|
|
const int64_t ne_dst = ggml_nelements(dst);
|
|
|
|
|
|
|
|
cudaStream_t main_stream = ctx.stream();
|
|
|
|
|
|
|
|
CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(), main_stream));
|
|
|
|
|
|
|
|
void * src0_ddq = src0->data;
|
|
|
|
half * src0_f16 = (half *) src0_ddq;
|
|
|
|
float * src1_ddf = (float *) src1->data;
|
|
|
|
float * dst_ddf = (float *) dst->data;
|
|
|
|
|
|
|
|
// convert src1 to fp16
|
|
|
|
ggml_cuda_pool_alloc<half> src1_f16_alloc(ctx.pool());
|
|
|
|
if (src1->type != GGML_TYPE_F16) {
|
|
|
|
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
|
|
|
|
const int64_t ne_src1 = ggml_nelements(src1);
|
|
|
|
src1_f16_alloc.alloc(ne_src1);
|
|
|
|
GGML_ASSERT(to_fp16_cuda != nullptr);
|
|
|
|
to_fp16_cuda(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
|
2023-10-08 17:19:14 +00:00
|
|
|
}
|
2024-03-20 13:42:59 +00:00
|
|
|
half * src1_f16 = src1->type == GGML_TYPE_F16 ? (half *) src1_ddf : src1_f16_alloc.get();
|
2023-10-08 17:19:14 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
ggml_cuda_pool_alloc<half> dst_f16(ctx.pool());
|
|
|
|
char * dst_t;
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
|
|
|
|
cudaDataType_t cu_data_type = CUDA_R_16F;
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
// dst strides
|
|
|
|
size_t nbd2 = dst->nb[2];
|
|
|
|
size_t nbd3 = dst->nb[3];
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
const half alpha_f16 = 1.0f;
|
|
|
|
const half beta_f16 = 0.0f;
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
const float alpha_f32 = 1.0f;
|
|
|
|
const float beta_f32 = 0.0f;
|
2023-12-21 20:07:46 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
const void * alpha = &alpha_f16;
|
|
|
|
const void * beta = &beta_f16;
|
2024-01-12 19:07:38 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
if (dst->op_params[0] == GGML_PREC_DEFAULT) {
|
|
|
|
dst_t = (char *) dst_f16.alloc(ne_dst);
|
|
|
|
|
|
|
|
nbd2 /= sizeof(float) / sizeof(half);
|
|
|
|
nbd3 /= sizeof(float) / sizeof(half);
|
|
|
|
} else {
|
|
|
|
dst_t = (char *) dst_ddf;
|
|
|
|
|
|
|
|
cu_compute_type = CUBLAS_COMPUTE_32F;
|
|
|
|
cu_data_type = CUDA_R_32F;
|
|
|
|
|
|
|
|
alpha = &alpha_f32;
|
|
|
|
beta = &beta_f32;
|
2024-01-12 19:07:38 +00:00
|
|
|
}
|
2024-03-13 17:54:21 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_ASSERT(ne12 % ne02 == 0);
|
|
|
|
GGML_ASSERT(ne13 % ne03 == 0);
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
// broadcast factors
|
|
|
|
const int64_t r2 = ne12/ne02;
|
|
|
|
const int64_t r3 = ne13/ne03;
|
2023-12-21 20:07:46 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
#if 0
|
|
|
|
// use cublasGemmEx
|
|
|
|
{
|
|
|
|
for (int i13 = 0; i13 < ne13; ++i13) {
|
|
|
|
for (int i12 = 0; i12 < ne12; ++i12) {
|
|
|
|
int i03 = i13 / r3;
|
|
|
|
int i02 = i12 / r2;
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
CUBLAS_CHECK(
|
|
|
|
cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
|
|
|
|
ne01, ne11, ne10,
|
|
|
|
alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half),
|
|
|
|
(const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
|
|
|
|
beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01,
|
|
|
|
cu_compute_type,
|
|
|
|
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
|
|
|
|
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
|
|
|
|
// use cublasGemmStridedBatchedEx
|
|
|
|
CUBLAS_CHECK(
|
|
|
|
cublasGemmStridedBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
|
|
|
|
ne01, ne11, ne10,
|
|
|
|
alpha, (const char *) src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA
|
|
|
|
(const char *) src1_f16, CUDA_R_16F, nb11/nb10, nb12/nb10, // strideB
|
|
|
|
beta, ( char *) dst_t, cu_data_type, ne01, nb2/nb0, // strideC
|
|
|
|
ne12*ne13,
|
|
|
|
cu_compute_type,
|
|
|
|
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
|
|
|
} else {
|
|
|
|
// use cublasGemmBatchedEx
|
|
|
|
const int ne23 = ne12*ne13;
|
2023-10-08 17:19:14 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
ggml_cuda_pool_alloc<const void *> ptrs_src(ctx.pool(), 2*ne23);
|
|
|
|
ggml_cuda_pool_alloc< void *> ptrs_dst(ctx.pool(), 1*ne23);
|
|
|
|
|
|
|
|
dim3 block_dims(ne13, ne12);
|
|
|
|
k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
|
|
|
|
src0_f16, src1_f16, dst_t,
|
|
|
|
ptrs_src.get(), ptrs_dst.get(),
|
|
|
|
ne12, ne13,
|
|
|
|
ne23,
|
|
|
|
nb02, nb03,
|
|
|
|
src1->type == GGML_TYPE_F16 ? nb12 : nb12/2,
|
|
|
|
src1->type == GGML_TYPE_F16 ? nb13 : nb13/2,
|
|
|
|
nbd2, nbd3,
|
|
|
|
r2, r3);
|
|
|
|
CUDA_CHECK(cudaGetLastError());
|
2024-01-12 19:07:38 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
CUBLAS_CHECK(
|
|
|
|
cublasGemmBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
|
|
|
|
ne01, ne11, ne10,
|
|
|
|
alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/nb00,
|
|
|
|
(const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/nb10,
|
|
|
|
beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01,
|
|
|
|
ne23,
|
|
|
|
cu_compute_type,
|
|
|
|
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
|
|
|
}
|
|
|
|
#endif
|
2024-01-12 19:07:38 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
if (dst->op_params[0] == GGML_PREC_DEFAULT) {
|
|
|
|
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
|
|
|
|
to_fp32_cuda(dst_f16.get(), dst_ddf, ne_dst, main_stream);
|
|
|
|
}
|
2024-01-12 19:07:38 +00:00
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
|
|
const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);
|
2023-10-08 17:19:14 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
int64_t min_compute_capability = INT_MAX;
|
2023-11-13 12:16:23 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
bool any_pascal_with_slow_fp16 = false;
|
|
|
|
if (split) {
|
|
|
|
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
|
|
|
|
auto & tensor_split = buft_ctx->tensor_split;
|
|
|
|
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
|
|
|
// skip devices that are not going to do any work:
|
|
|
|
if (tensor_split[id] >= (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
|
|
|
|
continue;
|
|
|
|
}
|
2023-11-13 12:16:23 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
if (min_compute_capability > ggml_cuda_info().devices[id].cc) {
|
|
|
|
min_compute_capability = ggml_cuda_info().devices[id].cc;
|
2024-03-20 13:42:59 +00:00
|
|
|
}
|
2024-03-25 12:50:23 +00:00
|
|
|
if (ggml_cuda_info().devices[id].cc == 610) {
|
2024-03-20 13:42:59 +00:00
|
|
|
any_pascal_with_slow_fp16 = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
2024-03-25 12:50:23 +00:00
|
|
|
min_compute_capability = ggml_cuda_info().devices[ctx.device].cc;
|
|
|
|
any_pascal_with_slow_fp16 = ggml_cuda_info().devices[ctx.device].cc == 610;
|
2024-01-12 19:07:38 +00:00
|
|
|
}
|
2023-11-13 12:16:23 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
// check data types and tensor shapes for custom matrix multiplication kernels:
|
|
|
|
bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
|
|
|
|
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
|
|
|
&& src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
|
|
|
|
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
|
|
|
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
|
2023-10-08 17:19:14 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
bool use_mul_mat_q = ggml_cuda_supports_mmq(src0->type)
|
|
|
|
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
#ifdef CUDA_USE_TENSOR_CORES
|
|
|
|
use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
|
|
|
|
#endif // CUDA_USE_TENSOR_CORES
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
#else
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
// fp16 performance is good on Volta or newer and on P100 (compute capability 6.0)
|
|
|
|
const bool fp16_performance_good = min_compute_capability >= CC_PASCAL && !any_pascal_with_slow_fp16;
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
// mmvq and mmq need the __dp4a instruction which on NVIDIA is only available for CC >= 6.1
|
|
|
|
use_mul_mat_vec_q = use_mul_mat_vec_q && min_compute_capability >= MIN_CC_DP4A;
|
|
|
|
use_mul_mat_q = use_mul_mat_q && min_compute_capability >= MIN_CC_DP4A;
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
#ifdef CUDA_USE_TENSOR_CORES
|
|
|
|
// when tensor cores are available, use them for large batch size
|
|
|
|
// ref: https://github.com/ggerganov/llama.cpp/pull/3776
|
|
|
|
use_mul_mat_q = use_mul_mat_q && (!fp16_performance_good || src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
|
|
|
|
#endif // CUDA_USE_TENSOR_CORES
|
2024-01-12 19:07:38 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
// if mmvq is available it's a better choice than dmmv:
|
|
|
|
#ifndef GGML_CUDA_FORCE_DMMV
|
|
|
|
use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
|
|
|
|
#endif // GGML_CUDA_FORCE_DMMV
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
// debug helpers
|
|
|
|
//printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
|
|
|
|
//printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
|
|
|
|
//printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
|
|
|
|
//printf(" %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
|
|
|
|
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
|
|
|
|
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
|
2024-03-18 10:03:04 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
|
|
|
|
// KQ single-batch
|
|
|
|
ggml_cuda_mul_mat_vec_p021(ctx, src0, src1, dst);
|
|
|
|
} else if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
|
|
|
// KQV single-batch
|
|
|
|
ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst);
|
2024-04-13 22:21:55 +00:00
|
|
|
} else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || fp16_performance_good) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
|
2024-03-20 13:42:59 +00:00
|
|
|
// KQ + KQV multi-batch
|
|
|
|
ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
|
|
|
|
} else if (use_dequantize_mul_mat_vec) {
|
|
|
|
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
|
|
|
|
} else if (use_mul_mat_vec_q) {
|
|
|
|
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
|
|
|
|
} else if (use_mul_mat_q) {
|
|
|
|
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
|
|
|
|
} else {
|
|
|
|
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
|
2024-01-12 19:07:38 +00:00
|
|
|
}
|
2024-03-20 13:42:59 +00:00
|
|
|
}
|
2024-01-12 19:07:38 +00:00
|
|
|
|
2024-04-18 13:18:48 +00:00
|
|
|
struct mmid_row_mapping {
|
|
|
|
int32_t i1;
|
|
|
|
int32_t i2;
|
|
|
|
};
|
|
|
|
|
|
|
|
static __global__ void k_copy_src1_to_contiguous(const char * __restrict__ src1_original, char * __restrict__ src1_contiguous,
|
|
|
|
int * __restrict__ cur_src1_row, mmid_row_mapping * __restrict__ row_mapping,
|
|
|
|
const char * __restrict ids, int64_t i02, size_t ids_nb1, size_t ids_nb0,
|
|
|
|
int64_t ne11, int64_t ne10,
|
|
|
|
size_t nb11, size_t nb12) {
|
|
|
|
int32_t iid1 = blockIdx.x;
|
|
|
|
int32_t id = blockIdx.y;
|
|
|
|
|
|
|
|
const int32_t row_id_i = *(const int32_t *) (ids + iid1*ids_nb1 + id*ids_nb0);
|
|
|
|
|
|
|
|
if (row_id_i != i02) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
const int64_t i11 = id % ne11;
|
|
|
|
const int64_t i12 = iid1;
|
|
|
|
|
|
|
|
__shared__ int src1_row;
|
|
|
|
if (threadIdx.x == 0) {
|
|
|
|
src1_row = atomicAdd(cur_src1_row, 1);
|
|
|
|
row_mapping[src1_row] = {id, iid1};
|
|
|
|
}
|
|
|
|
__syncthreads();
|
|
|
|
|
|
|
|
const float * src1_row_original = (const float *)(src1_original + i11*nb11 + i12*nb12);
|
|
|
|
float * src1_row_contiguous = (float *)(src1_contiguous + src1_row*nb11);
|
|
|
|
|
|
|
|
for (int i = threadIdx.x; i < ne10; i += blockDim.x) {
|
|
|
|
src1_row_contiguous[i] = src1_row_original[i];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static __global__ void k_copy_dst_from_contiguous(char * __restrict__ dst_original, const char * __restrict__ dst_contiguous,
|
|
|
|
const mmid_row_mapping * __restrict__ row_mapping,
|
|
|
|
int64_t ne0,
|
|
|
|
size_t nb1, size_t nb2) {
|
|
|
|
int32_t i = blockIdx.x;
|
|
|
|
|
|
|
|
const int32_t i1 = row_mapping[i].i1;
|
|
|
|
const int32_t i2 = row_mapping[i].i2;
|
|
|
|
|
|
|
|
const float * dst_row_contiguous = (const float *)(dst_contiguous + i*nb1);
|
|
|
|
float * dst_row_original = (float *)(dst_original + i1*nb1 + i2*nb2);
|
|
|
|
|
|
|
|
for (int j = threadIdx.x; j < ne0; j += blockDim.x) {
|
|
|
|
dst_row_original[j] = dst_row_contiguous[j];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
|
|
|
const ggml_tensor * src0 = dst->src[0];
|
|
|
|
const ggml_tensor * src1 = dst->src[1];
|
2024-04-03 13:07:05 +00:00
|
|
|
const ggml_tensor * ids = dst->src[2];
|
|
|
|
|
2024-04-18 13:18:48 +00:00
|
|
|
GGML_TENSOR_BINARY_OP_LOCALS
|
|
|
|
|
2024-04-03 13:07:05 +00:00
|
|
|
GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0->buffer) && "mul_mat_id does not support split buffers");
|
2024-03-25 12:50:23 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
cudaStream_t stream = ctx.stream();
|
2024-01-12 19:07:38 +00:00
|
|
|
|
2024-04-18 13:18:48 +00:00
|
|
|
const int64_t n_as = ne02;
|
|
|
|
const int64_t n_ids = ids->ne[0];
|
2024-01-12 19:07:38 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
std::vector<char> ids_host(ggml_nbytes(ids));
|
|
|
|
const char * ids_dev = (const char *) ids->data;
|
|
|
|
CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
|
|
|
|
CUDA_CHECK(cudaStreamSynchronize(stream));
|
2024-01-12 19:07:38 +00:00
|
|
|
|
2024-04-03 13:07:05 +00:00
|
|
|
ggml_tensor src0_row = *src0;
|
2024-03-20 13:42:59 +00:00
|
|
|
ggml_tensor src1_row = *src1;
|
2024-04-18 13:18:48 +00:00
|
|
|
ggml_tensor dst_row = *dst;
|
2024-01-12 19:07:38 +00:00
|
|
|
|
2024-04-03 13:07:05 +00:00
|
|
|
char * src0_original = (char *) src0->data;
|
2024-03-20 13:42:59 +00:00
|
|
|
char * src1_original = (char *) src1->data;
|
|
|
|
char * dst_original = (char *) dst->data;
|
2024-01-12 19:07:38 +00:00
|
|
|
|
2024-04-03 13:07:05 +00:00
|
|
|
src0_row.ne[2] = 1;
|
|
|
|
src0_row.ne[3] = 1;
|
2024-04-18 13:18:48 +00:00
|
|
|
src0_row.nb[3] = nb02;
|
2024-04-03 13:07:05 +00:00
|
|
|
|
2024-04-18 13:18:48 +00:00
|
|
|
src1_row.ne[1] = 1;
|
|
|
|
src1_row.ne[2] = 1;
|
|
|
|
src1_row.ne[3] = 1;
|
|
|
|
src1_row.nb[2] = nb11;
|
|
|
|
src1_row.nb[3] = nb11;
|
2024-01-12 19:07:38 +00:00
|
|
|
|
2024-04-18 13:18:48 +00:00
|
|
|
dst_row.ne[1] = 1;
|
|
|
|
dst_row.ne[2] = 1;
|
|
|
|
dst_row.ne[3] = 1;
|
|
|
|
dst_row.nb[2] = nb1;
|
|
|
|
dst_row.nb[3] = nb1;
|
2024-03-13 17:54:21 +00:00
|
|
|
|
2024-04-18 13:18:48 +00:00
|
|
|
if (ne12 == 1) {
|
|
|
|
for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
|
|
|
|
for (int64_t id = 0; id < n_ids; id++) {
|
|
|
|
const int32_t i02 = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
|
2024-01-12 19:07:38 +00:00
|
|
|
|
2024-04-18 13:18:48 +00:00
|
|
|
GGML_ASSERT(i02 >= 0 && i02 < n_as);
|
|
|
|
|
|
|
|
const int64_t i11 = id % ne11;
|
|
|
|
const int64_t i12 = iid1;
|
|
|
|
|
|
|
|
const int64_t i1 = id;
|
|
|
|
const int64_t i2 = i12;
|
|
|
|
|
|
|
|
src0_row.data = src0_original + i02*nb02;
|
|
|
|
src1_row.data = src1_original + i11*nb11 + i12*nb12;
|
|
|
|
dst_row.data = dst_original + i1*nb1 + i2*nb2;
|
|
|
|
|
|
|
|
ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
|
|
|
|
}
|
2024-03-20 13:42:59 +00:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
|
|
|
|
ggml_cuda_pool_alloc<char> dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
|
2024-01-12 19:07:38 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
src1_row.data = src1_contiguous.get();
|
|
|
|
dst_row.data = dst_contiguous.get();
|
2024-01-12 19:07:38 +00:00
|
|
|
|
2024-04-18 13:18:48 +00:00
|
|
|
for (int64_t i02 = 0; i02 < n_as; i02++) {
|
2024-03-20 13:42:59 +00:00
|
|
|
int64_t num_src1_rows = 0;
|
2024-01-12 19:07:38 +00:00
|
|
|
|
2024-04-18 13:18:48 +00:00
|
|
|
for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
|
|
|
|
for (int64_t id = 0; id < n_ids; id++) {
|
|
|
|
const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
|
2024-01-12 19:07:38 +00:00
|
|
|
|
2024-04-18 13:18:48 +00:00
|
|
|
GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
|
2024-01-12 19:07:38 +00:00
|
|
|
|
2024-04-18 13:18:48 +00:00
|
|
|
if (row_id_i != i02) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
num_src1_rows++;
|
|
|
|
}
|
2024-03-20 13:42:59 +00:00
|
|
|
}
|
2024-01-12 19:07:38 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
if (num_src1_rows == 0) {
|
|
|
|
continue;
|
|
|
|
}
|
2024-01-12 19:07:38 +00:00
|
|
|
|
2024-04-18 13:18:48 +00:00
|
|
|
ggml_cuda_pool_alloc<int> dev_cur_src1_row(ctx.pool(), 1);
|
|
|
|
ggml_cuda_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), num_src1_rows);
|
|
|
|
CUDA_CHECK(cudaMemsetAsync(dev_cur_src1_row.get(), 0, sizeof(int), stream));
|
2024-04-03 13:07:05 +00:00
|
|
|
|
2024-04-18 13:18:48 +00:00
|
|
|
{
|
|
|
|
dim3 block_dims(std::min((unsigned int)ne10, 768u));
|
|
|
|
dim3 grid_dims(ids->ne[1], n_ids);
|
|
|
|
k_copy_src1_to_contiguous<<<grid_dims, block_dims, 0, stream>>>(
|
|
|
|
src1_original, src1_contiguous.get(),
|
|
|
|
dev_cur_src1_row.get(), dev_row_mapping.get(),
|
|
|
|
ids_dev, i02, ids->nb[1], ids->nb[0],
|
|
|
|
ne11, ne10,
|
|
|
|
nb11, nb12);
|
|
|
|
CUDA_CHECK(cudaGetLastError());
|
|
|
|
}
|
|
|
|
|
|
|
|
src0_row.data = src0_original + i02*nb02;
|
2024-01-12 19:07:38 +00:00
|
|
|
|
2024-04-18 13:18:48 +00:00
|
|
|
GGML_ASSERT(nb11 == sizeof(float)*ne10);
|
|
|
|
GGML_ASSERT(nb1 == sizeof(float)*ne0);
|
|
|
|
|
|
|
|
src1_row.ne[1] = num_src1_rows;
|
2024-03-20 13:42:59 +00:00
|
|
|
src1_row.nb[1] = nb11;
|
|
|
|
src1_row.nb[2] = num_src1_rows*nb11;
|
|
|
|
src1_row.nb[3] = num_src1_rows*nb11;
|
2024-01-12 19:07:38 +00:00
|
|
|
|
2024-04-18 13:18:48 +00:00
|
|
|
dst_row.ne[1] = num_src1_rows;
|
2024-03-20 13:42:59 +00:00
|
|
|
dst_row.nb[1] = nb1;
|
|
|
|
dst_row.nb[2] = num_src1_rows*nb1;
|
|
|
|
dst_row.nb[3] = num_src1_rows*nb1;
|
2024-01-12 19:07:38 +00:00
|
|
|
|
2024-04-03 13:07:05 +00:00
|
|
|
ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
|
2024-01-12 19:07:38 +00:00
|
|
|
|
2024-04-18 13:18:48 +00:00
|
|
|
{
|
|
|
|
dim3 block_dims(std::min((unsigned int)ne0, 768u));
|
|
|
|
dim3 grid_dims(num_src1_rows);
|
|
|
|
k_copy_dst_from_contiguous<<<grid_dims, block_dims, 0, stream>>>(
|
|
|
|
dst_original, dst_contiguous.get(),
|
|
|
|
dev_row_mapping.get(),
|
|
|
|
ne0,
|
|
|
|
nb1, nb2);
|
|
|
|
CUDA_CHECK(cudaGetLastError());
|
2024-03-20 13:42:59 +00:00
|
|
|
}
|
2024-01-12 19:07:38 +00:00
|
|
|
}
|
|
|
|
}
|
2024-03-20 13:42:59 +00:00
|
|
|
}
|
2024-01-12 19:07:38 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
|
|
|
|
// why is this here instead of mul_mat?
|
|
|
|
if (dst->src[0] != nullptr && ggml_backend_buffer_is_cuda_split(dst->src[0]->buffer)) {
|
|
|
|
ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device);
|
2023-12-23 15:10:51 +00:00
|
|
|
}
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
switch (dst->op) {
|
2024-03-20 13:42:59 +00:00
|
|
|
case GGML_OP_REPEAT:
|
2024-03-25 12:50:23 +00:00
|
|
|
ggml_cuda_op_repeat(ctx, dst);
|
2024-03-20 13:42:59 +00:00
|
|
|
break;
|
|
|
|
case GGML_OP_GET_ROWS:
|
2024-03-25 12:50:23 +00:00
|
|
|
ggml_cuda_op_get_rows(ctx, dst);
|
2024-03-20 13:42:59 +00:00
|
|
|
break;
|
|
|
|
case GGML_OP_DUP:
|
2024-03-25 12:50:23 +00:00
|
|
|
ggml_cuda_dup(ctx, dst);
|
|
|
|
break;
|
|
|
|
case GGML_OP_CPY:
|
|
|
|
ggml_cuda_cpy(ctx, dst->src[0], dst->src[1]);
|
|
|
|
break;
|
|
|
|
case GGML_OP_CONT:
|
|
|
|
ggml_cuda_dup(ctx, dst);
|
2024-03-20 13:42:59 +00:00
|
|
|
break;
|
|
|
|
case GGML_OP_ADD:
|
2024-03-25 12:50:23 +00:00
|
|
|
ggml_cuda_op_add(ctx, dst);
|
2024-03-20 13:42:59 +00:00
|
|
|
break;
|
|
|
|
case GGML_OP_ACC:
|
2024-03-25 12:50:23 +00:00
|
|
|
ggml_cuda_op_acc(ctx, dst);
|
2024-03-20 13:42:59 +00:00
|
|
|
break;
|
|
|
|
case GGML_OP_MUL:
|
2024-03-25 12:50:23 +00:00
|
|
|
ggml_cuda_op_mul(ctx, dst);
|
2024-03-20 13:42:59 +00:00
|
|
|
break;
|
|
|
|
case GGML_OP_DIV:
|
2024-03-25 12:50:23 +00:00
|
|
|
ggml_cuda_op_div(ctx, dst);
|
2024-03-20 13:42:59 +00:00
|
|
|
break;
|
|
|
|
case GGML_OP_UNARY:
|
2024-03-25 12:50:23 +00:00
|
|
|
switch (ggml_get_unary_op(dst)) {
|
2024-03-20 13:42:59 +00:00
|
|
|
case GGML_UNARY_OP_GELU:
|
2024-03-25 12:50:23 +00:00
|
|
|
ggml_cuda_op_gelu(ctx, dst);
|
2024-03-20 13:42:59 +00:00
|
|
|
break;
|
|
|
|
case GGML_UNARY_OP_SILU:
|
2024-03-25 12:50:23 +00:00
|
|
|
ggml_cuda_op_silu(ctx, dst);
|
2024-03-20 13:42:59 +00:00
|
|
|
break;
|
|
|
|
case GGML_UNARY_OP_GELU_QUICK:
|
2024-03-25 12:50:23 +00:00
|
|
|
ggml_cuda_op_gelu_quick(ctx, dst);
|
2024-03-20 13:42:59 +00:00
|
|
|
break;
|
|
|
|
case GGML_UNARY_OP_TANH:
|
2024-03-25 12:50:23 +00:00
|
|
|
ggml_cuda_op_tanh(ctx, dst);
|
2024-03-20 13:42:59 +00:00
|
|
|
break;
|
|
|
|
case GGML_UNARY_OP_RELU:
|
2024-03-25 12:50:23 +00:00
|
|
|
ggml_cuda_op_relu(ctx, dst);
|
2024-03-20 13:42:59 +00:00
|
|
|
break;
|
|
|
|
case GGML_UNARY_OP_HARDSIGMOID:
|
2024-03-25 12:50:23 +00:00
|
|
|
ggml_cuda_op_hardsigmoid(ctx, dst);
|
2024-03-20 13:42:59 +00:00
|
|
|
break;
|
|
|
|
case GGML_UNARY_OP_HARDSWISH:
|
2024-03-25 12:50:23 +00:00
|
|
|
ggml_cuda_op_hardswish(ctx, dst);
|
2024-03-20 13:42:59 +00:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case GGML_OP_NORM:
|
2024-03-25 12:50:23 +00:00
|
|
|
ggml_cuda_op_norm(ctx, dst);
|
2024-03-20 13:42:59 +00:00
|
|
|
break;
|
|
|
|
case GGML_OP_GROUP_NORM:
|
2024-03-25 12:50:23 +00:00
|
|
|
ggml_cuda_op_group_norm(ctx, dst);
|
2024-03-20 13:42:59 +00:00
|
|
|
break;
|
|
|
|
case GGML_OP_CONCAT:
|
2024-03-25 12:50:23 +00:00
|
|
|
ggml_cuda_op_concat(ctx, dst);
|
2024-03-20 13:42:59 +00:00
|
|
|
break;
|
|
|
|
case GGML_OP_UPSCALE:
|
2024-03-25 12:50:23 +00:00
|
|
|
ggml_cuda_op_upscale(ctx, dst);
|
2024-03-20 13:42:59 +00:00
|
|
|
break;
|
|
|
|
case GGML_OP_PAD:
|
2024-03-25 12:50:23 +00:00
|
|
|
ggml_cuda_op_pad(ctx, dst);
|
2024-03-20 13:42:59 +00:00
|
|
|
break;
|
|
|
|
case GGML_OP_ARANGE:
|
2024-03-25 12:50:23 +00:00
|
|
|
ggml_cuda_op_arange(ctx, dst);
|
2024-03-20 13:42:59 +00:00
|
|
|
break;
|
|
|
|
case GGML_OP_TIMESTEP_EMBEDDING:
|
2024-03-25 12:50:23 +00:00
|
|
|
ggml_cuda_op_timestep_embedding(ctx, dst);
|
2024-03-20 13:42:59 +00:00
|
|
|
break;
|
|
|
|
case GGML_OP_LEAKY_RELU:
|
2024-03-25 12:50:23 +00:00
|
|
|
ggml_cuda_op_leaky_relu(ctx, dst);
|
2024-03-20 13:42:59 +00:00
|
|
|
break;
|
|
|
|
case GGML_OP_RMS_NORM:
|
2024-03-25 12:50:23 +00:00
|
|
|
ggml_cuda_op_rms_norm(ctx, dst);
|
2024-03-20 13:42:59 +00:00
|
|
|
break;
|
|
|
|
case GGML_OP_MUL_MAT:
|
2024-03-25 12:50:23 +00:00
|
|
|
if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
|
|
|
|
fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
|
2024-03-20 13:42:59 +00:00
|
|
|
return false;
|
|
|
|
} else {
|
2024-03-25 12:50:23 +00:00
|
|
|
ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst);
|
2024-03-20 13:42:59 +00:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
case GGML_OP_MUL_MAT_ID:
|
2024-03-25 12:50:23 +00:00
|
|
|
ggml_cuda_mul_mat_id(ctx, dst);
|
2024-03-20 13:42:59 +00:00
|
|
|
break;
|
|
|
|
case GGML_OP_SCALE:
|
2024-03-25 12:50:23 +00:00
|
|
|
ggml_cuda_op_scale(ctx, dst);
|
2024-03-20 13:42:59 +00:00
|
|
|
break;
|
|
|
|
case GGML_OP_SQR:
|
2024-03-25 12:50:23 +00:00
|
|
|
ggml_cuda_op_sqr(ctx, dst);
|
2024-03-20 13:42:59 +00:00
|
|
|
break;
|
|
|
|
case GGML_OP_CLAMP:
|
2024-03-25 12:50:23 +00:00
|
|
|
ggml_cuda_op_clamp(ctx, dst);
|
2024-03-20 13:42:59 +00:00
|
|
|
break;
|
|
|
|
case GGML_OP_NONE:
|
|
|
|
case GGML_OP_RESHAPE:
|
|
|
|
case GGML_OP_VIEW:
|
|
|
|
case GGML_OP_PERMUTE:
|
|
|
|
case GGML_OP_TRANSPOSE:
|
2024-03-25 12:50:23 +00:00
|
|
|
break;
|
2024-03-20 13:42:59 +00:00
|
|
|
case GGML_OP_DIAG_MASK_INF:
|
2024-03-25 12:50:23 +00:00
|
|
|
ggml_cuda_op_diag_mask_inf(ctx, dst);
|
2024-03-20 13:42:59 +00:00
|
|
|
break;
|
|
|
|
case GGML_OP_SOFT_MAX:
|
2024-03-25 12:50:23 +00:00
|
|
|
ggml_cuda_op_soft_max(ctx, dst);
|
2024-03-20 13:42:59 +00:00
|
|
|
break;
|
|
|
|
case GGML_OP_ROPE:
|
2024-03-25 12:50:23 +00:00
|
|
|
ggml_cuda_op_rope(ctx, dst);
|
2024-03-20 13:42:59 +00:00
|
|
|
break;
|
|
|
|
case GGML_OP_ALIBI:
|
2024-03-25 12:50:23 +00:00
|
|
|
ggml_cuda_op_alibi(ctx, dst);
|
2024-03-20 13:42:59 +00:00
|
|
|
break;
|
|
|
|
case GGML_OP_IM2COL:
|
2024-03-25 12:50:23 +00:00
|
|
|
ggml_cuda_op_im2col(ctx, dst);
|
2024-03-20 13:42:59 +00:00
|
|
|
break;
|
|
|
|
case GGML_OP_POOL_2D:
|
2024-03-25 12:50:23 +00:00
|
|
|
ggml_cuda_op_pool2d(ctx, dst);
|
2024-03-20 13:42:59 +00:00
|
|
|
break;
|
|
|
|
case GGML_OP_SUM_ROWS:
|
2024-03-25 12:50:23 +00:00
|
|
|
ggml_cuda_op_sum_rows(ctx, dst);
|
2024-03-20 13:42:59 +00:00
|
|
|
break;
|
|
|
|
case GGML_OP_ARGSORT:
|
2024-03-25 12:50:23 +00:00
|
|
|
ggml_cuda_op_argsort(ctx, dst);
|
2024-03-20 13:42:59 +00:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2024-03-25 12:50:23 +00:00
|
|
|
cudaError_t err = cudaGetLastError();
|
|
|
|
if (err != cudaSuccess) {
|
|
|
|
fprintf(stderr, "%s: %s failed\n", __func__, ggml_op_desc(dst));
|
2024-04-03 13:07:05 +00:00
|
|
|
CUDA_CHECK(err);
|
2024-03-25 12:50:23 +00:00
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
return true;
|
2023-12-07 20:26:54 +00:00
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
2023-12-07 20:26:54 +00:00
|
|
|
|
|
|
|
// backend
|
|
|
|
|
2024-01-16 11:16:33 +00:00
|
|
|
GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
|
2024-01-12 19:07:38 +00:00
|
|
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-01-12 19:07:38 +00:00
|
|
|
return cuda_ctx->name.c_str();
|
2023-10-08 17:19:14 +00:00
|
|
|
}
|
|
|
|
|
2024-01-16 11:16:33 +00:00
|
|
|
GGML_CALL static void ggml_backend_cuda_free(ggml_backend_t backend) {
|
2024-01-12 19:07:38 +00:00
|
|
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
2023-12-07 20:26:54 +00:00
|
|
|
|
|
|
|
delete cuda_ctx;
|
|
|
|
delete backend;
|
|
|
|
}
|
|
|
|
|
2024-01-16 11:16:33 +00:00
|
|
|
GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
|
2024-01-12 19:07:38 +00:00
|
|
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
2023-12-07 20:26:54 +00:00
|
|
|
|
|
|
|
return ggml_backend_cuda_buffer_type(cuda_ctx->device);
|
|
|
|
}
|
|
|
|
|
2024-01-16 11:16:33 +00:00
|
|
|
GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
2024-01-12 19:07:38 +00:00
|
|
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
2024-03-13 17:54:21 +00:00
|
|
|
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-13 17:54:21 +00:00
|
|
|
GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
2023-10-08 17:19:14 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream()));
|
2023-10-08 17:19:14 +00:00
|
|
|
}
|
|
|
|
|
2024-01-16 11:16:33 +00:00
|
|
|
GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
2024-01-12 19:07:38 +00:00
|
|
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
2024-03-13 17:54:21 +00:00
|
|
|
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-13 17:54:21 +00:00
|
|
|
GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
2023-10-08 17:19:14 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cuda_ctx->stream()));
|
2023-10-08 17:19:14 +00:00
|
|
|
}
|
|
|
|
|
2024-03-13 17:54:21 +00:00
|
|
|
GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
|
|
|
|
GGML_ASSERT(ggml_backend_is_cuda(backend_src) || ggml_backend_is_cuda(backend_dst));
|
2023-10-08 17:19:14 +00:00
|
|
|
|
2024-03-13 17:54:21 +00:00
|
|
|
ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
|
|
|
|
ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
|
|
|
|
|
|
|
|
if (!ggml_backend_buffer_is_cuda(src->buffer)) {
|
|
|
|
return false;
|
2024-01-12 19:07:38 +00:00
|
|
|
}
|
2023-10-08 17:19:14 +00:00
|
|
|
|
2024-03-13 17:54:21 +00:00
|
|
|
if (!ggml_backend_buffer_is_cuda(dst->buffer)) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// device -> device
|
|
|
|
ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
|
|
|
|
ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
|
|
|
|
|
|
|
|
if (backend_src != backend_dst) {
|
|
|
|
ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
|
|
|
|
ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
|
|
|
|
|
|
|
|
GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device);
|
|
|
|
GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device);
|
|
|
|
|
|
|
|
// copy on src stream
|
|
|
|
if (cuda_ctx_src->device == cuda_ctx_dst->device) {
|
2024-03-20 13:42:59 +00:00
|
|
|
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
|
2024-03-13 17:54:21 +00:00
|
|
|
} else {
|
2024-03-22 13:05:31 +00:00
|
|
|
#ifdef GGML_CUDA_NO_PEER_COPY
|
|
|
|
return false;
|
|
|
|
#else
|
2024-03-20 13:42:59 +00:00
|
|
|
CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
|
2024-03-22 13:05:31 +00:00
|
|
|
#endif
|
2024-03-13 17:54:21 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// record event on src stream
|
2024-03-22 13:05:31 +00:00
|
|
|
if (!cuda_ctx_src->copy_event) {
|
|
|
|
ggml_cuda_set_device(cuda_ctx_src->device);
|
|
|
|
CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
|
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
CUDA_CHECK(cudaEventRecord(cuda_ctx_src->copy_event, cuda_ctx_src->stream()));
|
2024-03-13 17:54:21 +00:00
|
|
|
|
|
|
|
// wait on dst stream for the copy to complete
|
2024-03-20 13:42:59 +00:00
|
|
|
CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_dst->stream(), cuda_ctx_src->copy_event, 0));
|
2024-03-13 17:54:21 +00:00
|
|
|
} else {
|
|
|
|
// src and dst are on the same backend
|
2024-03-20 13:42:59 +00:00
|
|
|
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
|
2024-03-13 17:54:21 +00:00
|
|
|
}
|
|
|
|
return true;
|
2023-10-08 17:19:14 +00:00
|
|
|
}
|
|
|
|
|
2024-01-16 11:16:33 +00:00
|
|
|
GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
|
2024-01-12 19:07:38 +00:00
|
|
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
2023-10-08 17:19:14 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
CUDA_CHECK(cudaStreamSynchronize(cuda_ctx->stream()));
|
2023-10-08 17:19:14 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_UNUSED(backend);
|
2023-10-08 17:19:14 +00:00
|
|
|
}
|
|
|
|
|
2024-04-25 07:51:48 +00:00
|
|
|
#if (CUDART_VERSION >= 12000) && defined(GGML_ALLOW_CUDA_GRAPHS)
|
2024-04-22 08:32:06 +00:00
|
|
|
#define USE_CUDA_GRAPH
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifdef USE_CUDA_GRAPH
|
|
|
|
#define MAX_NODES_IN_CUDA_GRAPH 10000
|
2024-04-24 12:43:26 +00:00
|
|
|
struct ggml_cuda_graph {
|
|
|
|
int count = 0;
|
2024-04-19 12:09:03 +00:00
|
|
|
cudaGraph_t graph = nullptr;
|
|
|
|
cudaGraphExec_t instance = nullptr;
|
2024-04-24 12:43:26 +00:00
|
|
|
size_t num_nodes = 0;
|
2024-04-19 12:09:03 +00:00
|
|
|
int softmax_ne0 = 0;
|
2024-04-22 08:32:06 +00:00
|
|
|
cudaGraphNode_t nodes[MAX_NODES_IN_CUDA_GRAPH];
|
2024-04-22 11:50:39 +00:00
|
|
|
cudaKernelNodeParams params[MAX_NODES_IN_CUDA_GRAPH];
|
2024-04-24 12:43:26 +00:00
|
|
|
bool disable_due_to_gpu_arch = false;
|
2024-04-19 12:09:03 +00:00
|
|
|
};
|
2024-04-22 08:32:06 +00:00
|
|
|
#endif
|
2024-04-19 12:09:03 +00:00
|
|
|
|
2024-04-24 12:43:26 +00:00
|
|
|
const bool disable_cuda_graphs = (getenv("LLAMACPP_DISABLE_CUDA_GRAPHS") != nullptr);
|
2024-04-23 13:27:08 +00:00
|
|
|
|
2024-03-04 09:05:42 +00:00
|
|
|
GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
2024-01-12 19:07:38 +00:00
|
|
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
ggml_cuda_set_device(cuda_ctx->device);
|
2023-10-08 17:19:14 +00:00
|
|
|
|
2024-04-22 08:32:06 +00:00
|
|
|
#ifdef USE_CUDA_GRAPH
|
2024-04-19 12:09:03 +00:00
|
|
|
// Objects required for CUDA Graph
|
2024-04-24 12:43:26 +00:00
|
|
|
static ggml_cuda_graph cuda_graph;
|
|
|
|
bool use_cuda_graph = (cuda_graph.count >= 7); //avoid CUDA graphs on first few steps due to incompatible initialisations.
|
|
|
|
char ** updated_kernel_arg[MAX_NODES_IN_CUDA_GRAPH];
|
|
|
|
bool cuda_graph_update_required = false;
|
2024-04-19 12:09:03 +00:00
|
|
|
// pointer to CUDA cpy kernel, which is required to identify
|
|
|
|
// kernel parameters which need updated in the graph for each token
|
2024-04-24 12:43:26 +00:00
|
|
|
void * ggml_cuda_cpy_fn_ptr = nullptr;
|
2024-04-22 16:01:44 +00:00
|
|
|
|
2024-04-24 12:43:26 +00:00
|
|
|
if(cuda_graph.count == 0){
|
|
|
|
if (ggml_cuda_info().devices[cuda_ctx->device].cc < 800){
|
|
|
|
cuda_graph.disable_due_to_gpu_arch=true;
|
2024-04-23 13:27:08 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Disable CUDA graphs in presence of env var or old GPU.
|
|
|
|
// Also disable for multi-gpu for now. TO DO investigate
|
2024-04-24 12:43:26 +00:00
|
|
|
if(disable_cuda_graphs || cuda_graph.disable_due_to_gpu_arch || ggml_backend_cuda_get_device_count() > 1){
|
|
|
|
use_cuda_graph = false;
|
2024-04-22 16:01:44 +00:00
|
|
|
}
|
|
|
|
|
2024-04-24 12:43:26 +00:00
|
|
|
if(use_cuda_graph) {
|
2024-04-19 12:09:03 +00:00
|
|
|
|
2024-04-24 12:43:26 +00:00
|
|
|
if(cuda_graph.instance == nullptr) cuda_graph_update_required=true;
|
2024-04-19 12:09:03 +00:00
|
|
|
|
|
|
|
// Loop over nodes in GGML graph to obtain info needed for CUDA graph
|
|
|
|
int k=0;
|
|
|
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
|
|
|
ggml_tensor * node = cgraph->nodes[i];
|
2024-04-24 13:31:08 +00:00
|
|
|
// Identify if the graph needs to be updated for this token due to the number of elements changing
|
2024-04-19 12:09:03 +00:00
|
|
|
// (identified by inspecting soft max op parameters)
|
|
|
|
if(node->op == GGML_OP_SOFT_MAX) {
|
2024-04-22 16:01:44 +00:00
|
|
|
if(node->src[1]->ne[1] > 1){
|
2024-04-24 12:43:26 +00:00
|
|
|
use_cuda_graph = false; // disable CUDA graphs for batch size > 1 for now. TO DO investigate
|
2024-04-22 16:01:44 +00:00
|
|
|
}
|
2024-04-24 12:43:26 +00:00
|
|
|
if(node->src[0]->ne[0] != cuda_graph.softmax_ne0) {
|
|
|
|
cuda_graph_update_required = true;
|
|
|
|
cuda_graph.softmax_ne0 = node->src[0]->ne[0];
|
2024-04-19 12:09:03 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
if(node->op == GGML_OP_CPY) {
|
|
|
|
// store the copy op parameter which changes with each token.
|
2024-04-24 12:43:26 +00:00
|
|
|
updated_kernel_arg[k++]=(char **) &(node->src[1]->data);
|
|
|
|
if(ggml_cuda_cpy_fn_ptr == nullptr){
|
2024-04-19 12:09:03 +00:00
|
|
|
// store a pointer to the copy op CUDA kernel to identify it later
|
2024-04-24 12:43:26 +00:00
|
|
|
ggml_cuda_cpy_fn_ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
|
2024-04-19 12:09:03 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-04-24 12:43:26 +00:00
|
|
|
if(use_cuda_graph && cuda_graph_update_required) { // Start CUDA graph capture
|
2024-04-19 12:09:03 +00:00
|
|
|
CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeGlobal));
|
|
|
|
}
|
|
|
|
|
2024-04-22 08:32:06 +00:00
|
|
|
#else
|
2024-04-24 12:43:26 +00:00
|
|
|
bool use_cuda_graph = false;
|
|
|
|
bool cuda_graph_update_required = false;
|
2024-04-24 13:31:08 +00:00
|
|
|
#endif // USE_CUDA_GRAPH
|
2024-04-22 08:32:06 +00:00
|
|
|
|
2024-04-24 13:31:08 +00:00
|
|
|
// Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
|
|
|
|
// With the use of CUDA graphs, the execution will be performed by the graph launch.
|
2024-04-24 12:43:26 +00:00
|
|
|
if(!use_cuda_graph || cuda_graph_update_required) {
|
2024-04-19 12:09:03 +00:00
|
|
|
//temporarily avoid indenting here to make code review easier
|
2023-10-08 17:19:14 +00:00
|
|
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
|
|
|
ggml_tensor * node = cgraph->nodes[i];
|
|
|
|
|
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choices words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 14:46:41 +00:00
|
|
|
if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
|
2023-11-13 12:16:23 +00:00
|
|
|
continue;
|
2024-01-12 19:07:38 +00:00
|
|
|
}
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-01-12 19:07:38 +00:00
|
|
|
#ifndef NDEBUG
|
2023-12-07 20:26:54 +00:00
|
|
|
assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
|
2023-10-08 17:19:14 +00:00
|
|
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
|
|
if (node->src[j] != nullptr) {
|
2024-02-19 22:40:26 +00:00
|
|
|
assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
|
2023-10-08 17:19:14 +00:00
|
|
|
}
|
|
|
|
}
|
2024-01-12 19:07:38 +00:00
|
|
|
#endif
|
2023-10-08 17:19:14 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
|
2023-10-08 17:19:14 +00:00
|
|
|
if (!ok) {
|
|
|
|
fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
|
|
|
|
}
|
|
|
|
GGML_ASSERT(ok);
|
|
|
|
}
|
2024-04-19 12:09:03 +00:00
|
|
|
}
|
|
|
|
|
2024-04-24 13:31:08 +00:00
|
|
|
#ifdef USE_CUDA_GRAPH
|
2024-04-24 12:43:26 +00:00
|
|
|
if(use_cuda_graph && (cuda_graph_update_required)) { // End CUDA graph capture
|
|
|
|
CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_graph.graph));
|
2024-04-19 12:09:03 +00:00
|
|
|
}
|
2024-04-24 12:43:26 +00:00
|
|
|
if(use_cuda_graph){
|
2024-04-19 12:09:03 +00:00
|
|
|
|
2024-04-24 12:43:26 +00:00
|
|
|
if(cuda_graph.instance == nullptr) { // Create executable graph from captured graph.
|
|
|
|
CUDA_CHECK(cudaGraphInstantiate(&cuda_graph.instance, cuda_graph.graph, NULL, NULL, 0));
|
2024-04-19 12:09:03 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Perform update to graph (if required for this token), and change copy parameter (required for every token)
|
|
|
|
|
2024-04-24 12:43:26 +00:00
|
|
|
if(cuda_graph_update_required) {
|
2024-04-19 12:09:03 +00:00
|
|
|
// Extract nodes from graph
|
2024-04-24 12:43:26 +00:00
|
|
|
if(cuda_graph.num_nodes == 0) {
|
|
|
|
// First call with null argument gets number of nodes in graph
|
|
|
|
CUDA_CHECK(cudaGraphGetNodes(cuda_graph.graph, nullptr, &cuda_graph.num_nodes));
|
2024-04-19 12:09:03 +00:00
|
|
|
}
|
2024-04-24 12:43:26 +00:00
|
|
|
// Subsequent call with non-null argument gets nodes
|
|
|
|
CUDA_CHECK(cudaGraphGetNodes(cuda_graph.graph, cuda_graph.nodes, &cuda_graph.num_nodes));
|
2024-04-19 12:09:03 +00:00
|
|
|
|
2024-04-24 13:31:08 +00:00
|
|
|
// Loop over nodes, and extract kernel parameters from each node
|
2024-04-24 12:43:26 +00:00
|
|
|
for(size_t i=0; i<cuda_graph.num_nodes; i++) {
|
|
|
|
cudaGraphNodeType node_type;
|
|
|
|
CUDA_CHECK(cudaGraphNodeGetType(cuda_graph.nodes[i], &node_type));
|
|
|
|
if (node_type == cudaGraphNodeTypeKernel) {
|
|
|
|
auto stat = cudaGraphKernelNodeGetParams(cuda_graph.nodes[i], &cuda_graph.params[i]); // Get params using runtime
|
|
|
|
if(stat == cudaErrorInvalidDeviceFunction) {
|
2024-04-22 08:32:06 +00:00
|
|
|
// Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
|
|
|
|
// We don't need to update blas nodes, so clear error and move on.
|
|
|
|
cudaGetLastError();
|
|
|
|
}
|
2024-04-24 12:43:26 +00:00
|
|
|
else {
|
|
|
|
GGML_ASSERT(stat == cudaSuccess);
|
|
|
|
}
|
2024-04-19 12:09:03 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-04-24 12:43:26 +00:00
|
|
|
// One of the arguments to the copy kernel is updated for each token, hence we need to
|
|
|
|
// replace that argument with the updated value in the CUDA graph
|
|
|
|
if(!cuda_graph_update_required) { // on update steps, the live parameters will already be captured
|
2024-04-19 12:09:03 +00:00
|
|
|
int k=0;
|
2024-04-24 12:43:26 +00:00
|
|
|
for(size_t i=0; i<cuda_graph.num_nodes; i++) {
|
|
|
|
if(cuda_graph.params[i].func == ggml_cuda_cpy_fn_ptr) {
|
|
|
|
char ** updated_kernel_arg_ptr = updated_kernel_arg[k++];
|
|
|
|
cuda_graph.params[i].kernelParams[1] = updated_kernel_arg_ptr;
|
|
|
|
CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_graph.nodes[i], &cuda_graph.params[i]));
|
2024-04-19 12:09:03 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Update graph executable
|
2024-04-24 12:43:26 +00:00
|
|
|
cudaGraphExecUpdateResultInfo result_info;
|
|
|
|
auto stat = cudaGraphExecUpdate(cuda_graph.instance, cuda_graph.graph, &result_info);
|
|
|
|
if(stat == cudaErrorGraphExecUpdateFailure) {
|
2024-04-22 08:32:06 +00:00
|
|
|
// The pre-existing graph exec cannot be updated due to violated constraints
|
2024-04-24 12:43:26 +00:00
|
|
|
// so instead clear error and re-instantiate
|
2024-04-22 08:32:06 +00:00
|
|
|
cudaGetLastError();
|
2024-04-24 12:43:26 +00:00
|
|
|
CUDA_CHECK(cudaGraphInstantiate(&cuda_graph.instance, cuda_graph.graph, NULL, NULL, 0));
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
GGML_ASSERT(stat == cudaSuccess);
|
2024-04-22 08:32:06 +00:00
|
|
|
}
|
2024-04-19 12:09:03 +00:00
|
|
|
// Launch graph
|
2024-04-24 12:43:26 +00:00
|
|
|
CUDA_CHECK(cudaGraphLaunch(cuda_graph.instance, cuda_ctx->stream()));
|
2024-04-19 12:09:03 +00:00
|
|
|
}
|
2024-04-24 12:43:26 +00:00
|
|
|
cuda_graph.count++;
|
2024-04-24 13:31:08 +00:00
|
|
|
#endif // USE_CUDA_GRAPH
|
2024-03-04 09:05:42 +00:00
|
|
|
return GGML_STATUS_SUCCESS;
|
2023-10-08 17:19:14 +00:00
|
|
|
}
|
|
|
|
|
2024-01-16 11:16:33 +00:00
|
|
|
GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
|
2023-12-07 20:26:54 +00:00
|
|
|
switch (op->op) {
|
|
|
|
case GGML_OP_UNARY:
|
|
|
|
switch (ggml_get_unary_op(op)) {
|
|
|
|
case GGML_UNARY_OP_GELU:
|
|
|
|
case GGML_UNARY_OP_SILU:
|
|
|
|
case GGML_UNARY_OP_RELU:
|
2024-01-31 13:10:15 +00:00
|
|
|
case GGML_UNARY_OP_HARDSIGMOID:
|
|
|
|
case GGML_UNARY_OP_HARDSWISH:
|
2023-12-13 19:54:54 +00:00
|
|
|
case GGML_UNARY_OP_GELU_QUICK:
|
|
|
|
case GGML_UNARY_OP_TANH:
|
2023-12-07 20:26:54 +00:00
|
|
|
return true;
|
|
|
|
default:
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case GGML_OP_MUL_MAT:
|
|
|
|
case GGML_OP_MUL_MAT_ID:
|
|
|
|
{
|
|
|
|
struct ggml_tensor * a;
|
|
|
|
struct ggml_tensor * b;
|
|
|
|
if (op->op == GGML_OP_MUL_MAT) {
|
|
|
|
a = op->src[0];
|
|
|
|
b = op->src[1];
|
|
|
|
} else {
|
|
|
|
a = op->src[2];
|
|
|
|
b = op->src[1];
|
|
|
|
}
|
|
|
|
if (a->ne[3] != b->ne[3]) {
|
|
|
|
return false;
|
|
|
|
}
|
2024-01-17 16:54:56 +00:00
|
|
|
ggml_type a_type = a->type;
|
2024-02-21 09:39:52 +00:00
|
|
|
if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
|
2024-02-26 16:28:38 +00:00
|
|
|
a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S ||
|
2024-03-26 14:21:27 +00:00
|
|
|
a_type == GGML_TYPE_IQ1_M || a_type == GGML_TYPE_IQ2_S || a_type == GGML_TYPE_IQ4_XS) {
|
2024-01-17 16:54:56 +00:00
|
|
|
if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
2023-12-07 20:26:54 +00:00
|
|
|
return true;
|
|
|
|
} break;
|
2023-12-13 12:04:25 +00:00
|
|
|
case GGML_OP_GET_ROWS:
|
|
|
|
{
|
|
|
|
switch (op->src[0]->type) {
|
|
|
|
case GGML_TYPE_F16:
|
|
|
|
case GGML_TYPE_F32:
|
|
|
|
case GGML_TYPE_Q4_0:
|
|
|
|
case GGML_TYPE_Q4_1:
|
|
|
|
case GGML_TYPE_Q5_0:
|
|
|
|
case GGML_TYPE_Q5_1:
|
|
|
|
case GGML_TYPE_Q8_0:
|
|
|
|
return true;
|
|
|
|
default:
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
} break;
|
|
|
|
case GGML_OP_CPY:
|
|
|
|
{
|
|
|
|
ggml_type src0_type = op->src[0]->type;
|
|
|
|
ggml_type src1_type = op->src[1]->type;
|
|
|
|
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) {
|
|
|
|
return true;
|
|
|
|
}
|
2024-03-21 07:27:57 +00:00
|
|
|
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q5_0) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q5_1) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) {
|
|
|
|
return true;
|
|
|
|
}
|
2023-12-13 12:04:25 +00:00
|
|
|
if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
|
|
|
|
return true;
|
|
|
|
}
|
2024-01-29 12:37:33 +00:00
|
|
|
if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
|
|
|
|
return true;
|
|
|
|
}
|
2023-12-13 12:04:25 +00:00
|
|
|
return false;
|
|
|
|
} break;
|
2024-01-03 11:01:44 +00:00
|
|
|
case GGML_OP_DUP:
|
|
|
|
case GGML_OP_REPEAT:
|
|
|
|
case GGML_OP_CONCAT:
|
|
|
|
{
|
|
|
|
ggml_type src0_type = op->src[0]->type;
|
2024-01-03 12:18:46 +00:00
|
|
|
return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
|
2024-01-03 11:01:44 +00:00
|
|
|
} break;
|
2023-12-07 20:26:54 +00:00
|
|
|
case GGML_OP_NONE:
|
|
|
|
case GGML_OP_RESHAPE:
|
|
|
|
case GGML_OP_VIEW:
|
|
|
|
case GGML_OP_PERMUTE:
|
|
|
|
case GGML_OP_TRANSPOSE:
|
|
|
|
case GGML_OP_NORM:
|
|
|
|
case GGML_OP_ADD:
|
|
|
|
case GGML_OP_MUL:
|
|
|
|
case GGML_OP_DIV:
|
|
|
|
case GGML_OP_RMS_NORM:
|
|
|
|
case GGML_OP_SCALE:
|
|
|
|
case GGML_OP_SQR:
|
|
|
|
case GGML_OP_CLAMP:
|
|
|
|
case GGML_OP_CONT:
|
|
|
|
case GGML_OP_DIAG_MASK_INF:
|
|
|
|
case GGML_OP_SOFT_MAX:
|
|
|
|
case GGML_OP_ROPE:
|
|
|
|
case GGML_OP_ALIBI:
|
|
|
|
case GGML_OP_IM2COL:
|
2024-01-31 13:10:15 +00:00
|
|
|
case GGML_OP_POOL_2D:
|
2023-12-07 20:26:54 +00:00
|
|
|
case GGML_OP_SUM_ROWS:
|
|
|
|
case GGML_OP_ARGSORT:
|
2023-12-13 19:54:54 +00:00
|
|
|
case GGML_OP_ACC:
|
|
|
|
case GGML_OP_GROUP_NORM:
|
|
|
|
case GGML_OP_UPSCALE:
|
|
|
|
case GGML_OP_PAD:
|
2024-03-03 12:23:52 +00:00
|
|
|
case GGML_OP_ARANGE:
|
|
|
|
case GGML_OP_TIMESTEP_EMBEDDING:
|
2023-12-13 19:54:54 +00:00
|
|
|
case GGML_OP_LEAKY_RELU:
|
2023-12-07 20:26:54 +00:00
|
|
|
return true;
|
|
|
|
default:
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_UNUSED(backend);
|
2023-12-07 20:26:54 +00:00
|
|
|
}
|
|
|
|
|
2024-03-18 10:03:04 +00:00
|
|
|
GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
|
|
|
|
const int min_batch_size = 32;
|
|
|
|
|
2024-04-18 13:18:48 +00:00
|
|
|
return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
|
|
|
|
(op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
|
2024-03-18 10:03:04 +00:00
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_UNUSED(backend);
|
2024-03-18 10:03:04 +00:00
|
|
|
}
|
|
|
|
|
2024-03-13 17:54:21 +00:00
|
|
|
static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) {
|
2024-03-22 13:05:31 +00:00
|
|
|
#ifdef GGML_CUDA_NO_PEER_COPY
|
|
|
|
return nullptr;
|
|
|
|
#else
|
2024-03-13 17:54:21 +00:00
|
|
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
|
|
|
|
|
|
|
ggml_cuda_set_device(cuda_ctx->device);
|
|
|
|
|
|
|
|
cudaEvent_t event;
|
|
|
|
CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
|
|
|
|
|
|
|
|
return new ggml_backend_event {
|
|
|
|
/* .backend = */ backend,
|
|
|
|
/* .context = */ event,
|
|
|
|
};
|
2024-03-22 13:05:31 +00:00
|
|
|
#endif
|
2024-03-13 17:54:21 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void ggml_backend_cuda_event_free(ggml_backend_event_t event) {
|
|
|
|
CUDA_CHECK(cudaEventDestroy((cudaEvent_t)event->context));
|
|
|
|
|
|
|
|
delete event;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void ggml_backend_cuda_event_record(ggml_backend_event_t event) {
|
|
|
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)event->backend->context;
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, cuda_ctx->stream()));
|
2024-03-13 17:54:21 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
|
|
|
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
|
|
|
|
|
|
|
if (ggml_backend_is_cuda(event->backend)) {
|
2024-03-20 13:42:59 +00:00
|
|
|
CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), (cudaEvent_t)event->context, 0));
|
2024-03-13 17:54:21 +00:00
|
|
|
} else {
|
2024-03-15 12:24:03 +00:00
|
|
|
#if 0
|
2024-03-13 17:54:21 +00:00
|
|
|
// untested
|
|
|
|
auto wait_fn = [](void * user_data) {
|
|
|
|
ggml_backend_event_t event = (ggml_backend_event_t)user_data;
|
|
|
|
ggml_backend_event_synchronize(event);
|
|
|
|
};
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
CUDA_CHECK(cudaLaunchHostFunc(cuda_ctx->stream(), wait_fn, event));
|
2024-03-15 12:24:03 +00:00
|
|
|
#endif
|
|
|
|
GGML_ASSERT(false);
|
2024-03-13 17:54:21 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void ggml_backend_cuda_event_synchronize(ggml_backend_event_t event) {
|
|
|
|
CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
|
|
|
|
}
|
|
|
|
|
2024-01-12 19:07:38 +00:00
|
|
|
static ggml_backend_i ggml_backend_cuda_interface = {
|
2023-12-07 20:26:54 +00:00
|
|
|
/* .get_name = */ ggml_backend_cuda_name,
|
|
|
|
/* .free = */ ggml_backend_cuda_free,
|
|
|
|
/* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
|
|
|
|
/* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
|
|
|
|
/* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
|
2024-01-12 19:07:38 +00:00
|
|
|
/* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
|
2023-12-07 20:26:54 +00:00
|
|
|
/* .synchronize = */ ggml_backend_cuda_synchronize,
|
2024-01-12 19:07:38 +00:00
|
|
|
/* .graph_plan_create = */ NULL,
|
|
|
|
/* .graph_plan_free = */ NULL,
|
|
|
|
/* .graph_plan_compute = */ NULL,
|
2023-12-07 20:26:54 +00:00
|
|
|
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
|
|
|
|
/* .supports_op = */ ggml_backend_cuda_supports_op,
|
2024-03-18 10:03:04 +00:00
|
|
|
/* .offload_op = */ ggml_backend_cuda_offload_op,
|
2024-03-13 17:54:21 +00:00
|
|
|
/* .event_new = */ ggml_backend_cuda_event_new,
|
|
|
|
/* .event_free = */ ggml_backend_cuda_event_free,
|
|
|
|
/* .event_record = */ ggml_backend_cuda_event_record,
|
|
|
|
/* .event_wait = */ ggml_backend_cuda_event_wait,
|
|
|
|
/* .event_synchronize = */ ggml_backend_cuda_event_synchronize,
|
2023-10-08 17:19:14 +00:00
|
|
|
};
|
|
|
|
|
2024-02-24 16:27:36 +00:00
|
|
|
static ggml_guid_t ggml_backend_cuda_guid() {
|
|
|
|
static ggml_guid guid = { 0x2c, 0xdd, 0xe8, 0x1c, 0x65, 0xb3, 0x65, 0x73, 0x6a, 0x12, 0x88, 0x61, 0x1c, 0xc9, 0xdc, 0x25 };
|
|
|
|
return &guid;
|
|
|
|
}
|
|
|
|
|
2024-01-16 11:16:33 +00:00
|
|
|
GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
|
2024-03-20 13:42:59 +00:00
|
|
|
if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
|
2023-12-07 20:26:54 +00:00
|
|
|
fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
2024-03-13 17:54:21 +00:00
|
|
|
ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
|
|
|
|
if (ctx == nullptr) {
|
|
|
|
fprintf(stderr, "%s: error: failed to allocate context\n", __func__);
|
|
|
|
return nullptr;
|
|
|
|
}
|
2023-10-08 17:19:14 +00:00
|
|
|
|
|
|
|
ggml_backend_t cuda_backend = new ggml_backend {
|
2024-02-24 16:27:36 +00:00
|
|
|
/* .guid = */ ggml_backend_cuda_guid(),
|
2024-01-12 19:07:38 +00:00
|
|
|
/* .interface = */ ggml_backend_cuda_interface,
|
2023-10-08 17:19:14 +00:00
|
|
|
/* .context = */ ctx
|
|
|
|
};
|
|
|
|
|
|
|
|
return cuda_backend;
|
|
|
|
}
|
2023-12-07 20:26:54 +00:00
|
|
|
|
2024-01-16 11:16:33 +00:00
|
|
|
GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend) {
|
2024-02-24 16:27:36 +00:00
|
|
|
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cuda_guid());
|
2024-01-12 19:07:38 +00:00
|
|
|
}
|
|
|
|
|
2024-01-16 11:16:33 +00:00
|
|
|
GGML_CALL int ggml_backend_cuda_get_device_count() {
|
2024-03-25 12:50:23 +00:00
|
|
|
return ggml_cuda_info().device_count;
|
2024-01-12 19:07:38 +00:00
|
|
|
}
|
|
|
|
|
2024-01-16 11:16:33 +00:00
|
|
|
GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
|
2024-03-20 13:42:59 +00:00
|
|
|
cudaDeviceProp prop;
|
|
|
|
CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
|
|
|
|
snprintf(description, description_size, "%s", prop.name);
|
2024-01-12 19:07:38 +00:00
|
|
|
}
|
|
|
|
|
2024-01-16 11:16:33 +00:00
|
|
|
GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
|
2024-01-12 19:07:38 +00:00
|
|
|
ggml_cuda_set_device(device);
|
|
|
|
|
|
|
|
CUDA_CHECK(cudaMemGetInfo(free, total));
|
2023-12-07 20:26:54 +00:00
|
|
|
}
|
|
|
|
|
2024-03-18 10:03:04 +00:00
|
|
|
GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
|
2024-03-21 18:54:28 +00:00
|
|
|
if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
|
2024-03-18 10:03:04 +00:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2024-04-04 12:49:24 +00:00
|
|
|
#if CUDART_VERSION >= 11100
|
2024-03-18 10:03:04 +00:00
|
|
|
cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
|
|
|
|
if (err != cudaSuccess) {
|
|
|
|
// clear the error
|
|
|
|
cudaGetLastError();
|
|
|
|
|
|
|
|
fprintf(stderr, "%s: warning: failed to register %.2f MiB of pinned memory: %s\n", __func__,
|
|
|
|
size/1024.0/1024.0, cudaGetErrorString(err));
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
return true;
|
2024-04-04 12:49:24 +00:00
|
|
|
#else
|
|
|
|
return false;
|
|
|
|
#endif
|
2024-03-18 10:03:04 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
|
2024-03-21 18:54:28 +00:00
|
|
|
if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2024-03-18 10:03:04 +00:00
|
|
|
cudaError_t err = cudaHostUnregister(buffer);
|
|
|
|
if (err != cudaSuccess) {
|
|
|
|
// clear the error
|
|
|
|
cudaGetLastError();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-01-12 19:07:38 +00:00
|
|
|
// backend registry
|
2024-01-16 11:16:33 +00:00
|
|
|
GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
|
2023-12-07 20:26:54 +00:00
|
|
|
ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
|
|
|
|
return cuda_backend;
|
|
|
|
|
2024-03-20 13:42:59 +00:00
|
|
|
GGML_UNUSED(params);
|
2023-12-07 20:26:54 +00:00
|
|
|
}
|
|
|
|
|
2024-01-16 11:16:33 +00:00
|
|
|
extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();
|
2023-12-13 12:04:25 +00:00
|
|
|
|
2024-01-16 11:16:33 +00:00
|
|
|
GGML_CALL int ggml_backend_cuda_reg_devices() {
|
2024-03-20 13:42:59 +00:00
|
|
|
int device_count = ggml_backend_cuda_get_device_count();
|
2023-12-07 20:26:54 +00:00
|
|
|
//int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
|
|
|
|
for (int i = 0; i < device_count; i++) {
|
|
|
|
char name[128];
|
|
|
|
snprintf(name, sizeof(name), "%s%d", GGML_CUDA_NAME, i);
|
|
|
|
ggml_backend_register(name, ggml_backend_reg_cuda_init, ggml_backend_cuda_buffer_type(i), (void *) (intptr_t) i);
|
|
|
|
}
|
|
|
|
return device_count;
|
|
|
|
}
|