mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-27 20:04:35 +00:00
557410b8f0
* llama : greatly reduce logits memory usage * llama : more compact state saving and reloading * llama : fix lctx.n_outputs not being set before building graph * perplexity : adapt to the logits API changes * perplexity : fix Winogrande, use correct logits for second choice start The first logits used to evaluate the second choice were not from the end of the common prefix; instead, they were the logits from the end of the first choice. This has been corrected. The previous implementation sometimes had outliers in the scores of choices for some tasks, and the logic to skip choices words in the log-likelihood evaluation probably was an attempt to reduce those, but it was complex and didn't quite seem to be the right thing. This is simpler now, and the outlier scores aren't there anymore. * perplexity : normalize spaces and punctuation in Winogrande sentences * llama : fix embedding conditions * llama : fix llama_get_embeddings_ith when the resulting id is 0 * llama : fix wrong n_outputs in llama_set_inputs A mismatch happened when using a smaller n_ubatch than n_batch and then using llama_batch_get_one(). The decision of what n_outputs should be now almost fully depends on how lctx.n_outputs is set in llama_decode_internal. The conditions are simpler this way. * llama : when saving the state, recalculate n_outputs This ensures the correct number of outputs for the entire previous batch is stored in the session file, even when n_ubatch is smaller than n_batch. * llama : fix not-skipping outputs of non-causal models * llama : fix running a batch with n_outputs == 0 It previously worked because lctx.inp_out_ids was not initialized, so it pointed to some garbage address which was somehow still valid when I ran my tests. * llama : keep same graph topology even when n_outputs == 0 * ggml : saner ggml_can_repeat with empty tensors * ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1 * ggml : do not multi-thread ops returning empty tensors * ggml : make ggml_is_empty public and work with views * llama : use a vector for ctx->output_ids * llama : rework reallocation logic for llama_output_reserve Now comparing the actual size with the new total size of the output buffer to allow more efficient enabling and disabling of the embeddings and/or logits output in the future. * ggml : skip empty tensors in all backends * llama : fix llama_output_reserve nullptr deref when new_size is 0 * perplexity : make Winogrande work as it does on master The problems with the Winogrande implementation will need to be fixed in a separate PR to ease review. * llama : clearer error messages for invalid logits or embeddings ids * llama : assert all models that can have inp_out_ids Since the graph topology is now constant, this presence check can be done even when there are no outputs. * llama : assert logits and embd buffers exist before writing to them * llama : handle errors from llama_output_reserve at call sites * perplexity : make hellaswag and multiple-choice outputs identical to master Due to how the KV cache is updated, the logprobs for tokens in a batch are very slightly affected by the other tokens present in the batch, so to make hellaswag and multiple-choice return exactly the same results as on master, the last token of each sequence needs to be evaluated even though its output is not used at all. This will probably be changed back in the future to make these benchmarks a tiny bit faster. * perplexity : fix division by zero when using less than 100 multiple-choice tasks * llama : allow loading state saved with a different ctx size When loading a session file, the context size is now only required to be at least enough to load the KV cells contained in that session file, instead of requiring to use exactly the same context size as when saving. Doing this enables the use-case of extending or shrinking the context size of a saved session. This breaks existing session files because the meaning of kv_buf_size is slightly changed (previously it was the size of the whole KV cache, now it's only the size of the saved part of it). This allows for finer-grained sanity checks when loading in an effort to keep kv_buf_size useful even when the kv_size is changed. * llama : minor ggml-ci * readme : update recent API changes, and warn about Vulkan --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2844 lines
108 KiB
Plaintext
2844 lines
108 KiB
Plaintext
#include "ggml-cuda.h"
|
|
#include "ggml.h"
|
|
#include "ggml-backend-impl.h"
|
|
|
|
#include "ggml-cuda/common.cuh"
|
|
#include "ggml-cuda/acc.cuh"
|
|
#include "ggml-cuda/alibi.cuh"
|
|
#include "ggml-cuda/arange.cuh"
|
|
#include "ggml-cuda/argsort.cuh"
|
|
#include "ggml-cuda/binbcast.cuh"
|
|
#include "ggml-cuda/clamp.cuh"
|
|
#include "ggml-cuda/concat.cuh"
|
|
#include "ggml-cuda/convert.cuh"
|
|
#include "ggml-cuda/cpy.cuh"
|
|
#include "ggml-cuda/diagmask.cuh"
|
|
#include "ggml-cuda/dmmv.cuh"
|
|
#include "ggml-cuda/getrows.cuh"
|
|
#include "ggml-cuda/im2col.cuh"
|
|
#include "ggml-cuda/mmq.cuh"
|
|
#include "ggml-cuda/mmvq.cuh"
|
|
#include "ggml-cuda/norm.cuh"
|
|
#include "ggml-cuda/pad.cuh"
|
|
#include "ggml-cuda/pool2d.cuh"
|
|
#include "ggml-cuda/quantize.cuh"
|
|
#include "ggml-cuda/rope.cuh"
|
|
#include "ggml-cuda/scale.cuh"
|
|
#include "ggml-cuda/softmax.cuh"
|
|
#include "ggml-cuda/sumrows.cuh"
|
|
#include "ggml-cuda/tsembd.cuh"
|
|
#include "ggml-cuda/unary.cuh"
|
|
#include "ggml-cuda/upscale.cuh"
|
|
|
|
#include <algorithm>
|
|
#include <array>
|
|
#include <atomic>
|
|
#include <cinttypes>
|
|
#include <cstddef>
|
|
#include <cstdint>
|
|
#include <float.h>
|
|
#include <limits>
|
|
#include <map>
|
|
#include <memory>
|
|
#include <mutex>
|
|
#include <stdint.h>
|
|
#include <stdio.h>
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
|
|
|
[[noreturn]]
|
|
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
|
|
int id = -1; // in case cudaGetDevice fails
|
|
cudaGetDevice(&id);
|
|
|
|
fprintf(stderr, "CUDA error: %s\n", msg);
|
|
fprintf(stderr, " current device: %d, in function %s at %s:%d\n", id, func, file, line);
|
|
fprintf(stderr, " %s\n", stmt);
|
|
// abort with GGML_ASSERT to get a stack trace
|
|
GGML_ASSERT(!"CUDA error");
|
|
}
|
|
|
|
// this is faster on Windows
|
|
// probably because the Windows CUDA libraries forget to make this check before invoking the drivers
|
|
void ggml_cuda_set_device(int device) {
|
|
int current_device;
|
|
CUDA_CHECK(cudaGetDevice(¤t_device));
|
|
|
|
if (device == current_device) {
|
|
return;
|
|
}
|
|
|
|
CUDA_CHECK(cudaSetDevice(device));
|
|
}
|
|
|
|
int ggml_cuda_get_device() {
|
|
int id;
|
|
CUDA_CHECK(cudaGetDevice(&id));
|
|
return id;
|
|
}
|
|
|
|
static ggml_cuda_device_info ggml_cuda_init() {
|
|
#ifdef __HIP_PLATFORM_AMD__
|
|
// Workaround for a rocBLAS bug when using multiple graphics cards:
|
|
// https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
|
|
rocblas_initialize();
|
|
CUDA_CHECK(cudaDeviceSynchronize());
|
|
#endif
|
|
|
|
ggml_cuda_device_info info = {};
|
|
|
|
cudaError_t err = cudaGetDeviceCount(&info.device_count);
|
|
if (err != cudaSuccess) {
|
|
fprintf(stderr, "%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
|
|
return info;
|
|
}
|
|
|
|
GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES);
|
|
|
|
int64_t total_vram = 0;
|
|
#if defined(GGML_CUDA_FORCE_MMQ)
|
|
fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
|
|
#else
|
|
fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
|
|
#endif
|
|
#if defined(CUDA_USE_TENSOR_CORES)
|
|
fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
|
|
#else
|
|
fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
|
|
#endif
|
|
fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
|
|
for (int id = 0; id < info.device_count; ++id) {
|
|
int device_vmm = 0;
|
|
|
|
#if !defined(GGML_USE_HIPBLAS)
|
|
CUdevice device;
|
|
CU_CHECK(cuDeviceGet(&device, id));
|
|
CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
|
|
|
|
if (device_vmm) {
|
|
CUmemAllocationProp alloc_prop = {};
|
|
alloc_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
|
|
alloc_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
|
alloc_prop.location.id = id;
|
|
CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
|
|
}
|
|
#endif // !defined(GGML_USE_HIPBLAS)
|
|
info.devices[id].vmm = !!device_vmm;
|
|
|
|
cudaDeviceProp prop;
|
|
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
|
|
fprintf(stderr, " Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
|
|
|
|
info.default_tensor_split[id] = total_vram;
|
|
total_vram += prop.totalGlobalMem;
|
|
|
|
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
|
info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
|
|
#else
|
|
info.devices[id].cc = 100*prop.major + 10*prop.minor;
|
|
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
|
info.devices[id].smpb = prop.sharedMemPerBlock;
|
|
}
|
|
|
|
for (int id = 0; id < info.device_count; ++id) {
|
|
info.default_tensor_split[id] /= total_vram;
|
|
}
|
|
|
|
// configure logging to stdout
|
|
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
|
|
|
|
return info;
|
|
}
|
|
|
|
const ggml_cuda_device_info & ggml_cuda_info() {
|
|
static ggml_cuda_device_info info = ggml_cuda_init();
|
|
return info;
|
|
}
|
|
|
|
// #define DEBUG_CUDA_MALLOC
|
|
|
|
// buffer pool for cuda (legacy)
|
|
struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
|
static const int MAX_BUFFERS = 256;
|
|
|
|
int device;
|
|
struct ggml_cuda_buffer {
|
|
void * ptr = nullptr;
|
|
size_t size = 0;
|
|
};
|
|
|
|
ggml_cuda_buffer buffer_pool[MAX_BUFFERS] = {};
|
|
size_t pool_size = 0;
|
|
|
|
explicit ggml_cuda_pool_leg(int device) :
|
|
device(device) {
|
|
}
|
|
|
|
~ggml_cuda_pool_leg() {
|
|
ggml_cuda_set_device(device);
|
|
for (int i = 0; i < MAX_BUFFERS; ++i) {
|
|
ggml_cuda_buffer & b = buffer_pool[i];
|
|
if (b.ptr != nullptr) {
|
|
CUDA_CHECK(cudaFree(b.ptr));
|
|
pool_size -= b.size;
|
|
}
|
|
}
|
|
GGML_ASSERT(pool_size == 0);
|
|
}
|
|
|
|
void * alloc(size_t size, size_t * actual_size) override {
|
|
#ifdef DEBUG_CUDA_MALLOC
|
|
int nnz = 0;
|
|
size_t max_size = 0;
|
|
#endif
|
|
size_t best_diff = 1ull << 36;
|
|
int ibest = -1;
|
|
for (int i = 0; i < MAX_BUFFERS; ++i) {
|
|
ggml_cuda_buffer& b = buffer_pool[i];
|
|
if (b.ptr != nullptr) {
|
|
#ifdef DEBUG_CUDA_MALLOC
|
|
++nnz;
|
|
if (b.size > max_size) max_size = b.size;
|
|
#endif
|
|
if (b.size >= size) {
|
|
size_t diff = b.size - size;
|
|
if (diff < best_diff) {
|
|
best_diff = diff;
|
|
ibest = i;
|
|
if (!best_diff) {
|
|
void * ptr = b.ptr;
|
|
*actual_size = b.size;
|
|
b.ptr = nullptr;
|
|
b.size = 0;
|
|
return ptr;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if (ibest >= 0) {
|
|
ggml_cuda_buffer& b = buffer_pool[ibest];
|
|
void * ptr = b.ptr;
|
|
*actual_size = b.size;
|
|
b.ptr = nullptr;
|
|
b.size = 0;
|
|
return ptr;
|
|
}
|
|
void * ptr;
|
|
size_t look_ahead_size = (size_t) (1.05 * size);
|
|
look_ahead_size = 256 * ((look_ahead_size + 255)/256);
|
|
ggml_cuda_set_device(device);
|
|
CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
|
|
*actual_size = look_ahead_size;
|
|
pool_size += look_ahead_size;
|
|
#ifdef DEBUG_CUDA_MALLOC
|
|
fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
|
|
(uint32_t)(max_size/1024/1024), (uint32_t)(pool_size/1024/1024), (uint32_t)(size/1024/1024));
|
|
#endif
|
|
return ptr;
|
|
}
|
|
|
|
void free(void * ptr, size_t size) override {
|
|
for (int i = 0; i < MAX_BUFFERS; ++i) {
|
|
ggml_cuda_buffer& b = buffer_pool[i];
|
|
if (b.ptr == nullptr) {
|
|
b.ptr = ptr;
|
|
b.size = size;
|
|
return;
|
|
}
|
|
}
|
|
fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
|
|
ggml_cuda_set_device(device);
|
|
CUDA_CHECK(cudaFree(ptr));
|
|
pool_size -= size;
|
|
}
|
|
};
|
|
|
|
// pool with virtual memory
|
|
#if !defined(GGML_USE_HIPBLAS)
|
|
struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
|
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
|
|
|
|
int device;
|
|
CUdeviceptr pool_addr = 0;
|
|
size_t pool_used = 0;
|
|
size_t pool_size = 0;
|
|
size_t granularity;
|
|
|
|
explicit ggml_cuda_pool_vmm(int device) :
|
|
device(device),
|
|
granularity(ggml_cuda_info().devices[device].vmm_granularity) {
|
|
}
|
|
|
|
~ggml_cuda_pool_vmm() {
|
|
if (pool_addr != 0) {
|
|
CU_CHECK(cuMemUnmap(pool_addr, pool_size));
|
|
CU_CHECK(cuMemAddressFree(pool_addr, CUDA_POOL_VMM_MAX_SIZE));
|
|
}
|
|
}
|
|
|
|
void * alloc(size_t size, size_t * actual_size) override {
|
|
// round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
|
|
const size_t alignment = 128;
|
|
size = alignment * ((size + alignment - 1) / alignment);
|
|
|
|
size_t avail = pool_size - pool_used;
|
|
|
|
if (size > avail) {
|
|
// round up to the next multiple of the granularity
|
|
size_t reserve_size = size - avail;
|
|
reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);
|
|
|
|
GGML_ASSERT(pool_size + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
|
|
|
|
// allocate more physical memory
|
|
CUmemAllocationProp prop = {};
|
|
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
|
|
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
|
prop.location.id = device;
|
|
CUmemGenericAllocationHandle handle;
|
|
CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));
|
|
|
|
// reserve virtual address space (if not already reserved)
|
|
if (pool_addr == 0) {
|
|
CU_CHECK(cuMemAddressReserve(&pool_addr, CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
|
|
}
|
|
|
|
// map at the end of the pool
|
|
CU_CHECK(cuMemMap(pool_addr + pool_size, reserve_size, 0, handle, 0));
|
|
|
|
// the memory allocation handle is no longer needed after mapping
|
|
CU_CHECK(cuMemRelease(handle));
|
|
|
|
// set access
|
|
CUmemAccessDesc access = {};
|
|
access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
|
access.location.id = device;
|
|
access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
|
|
CU_CHECK(cuMemSetAccess(pool_addr + pool_size, reserve_size, &access, 1));
|
|
|
|
// add to the pool
|
|
pool_size += reserve_size;
|
|
|
|
//printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
|
|
// device, (unsigned long long) (pool_size/1024/1024),
|
|
// (unsigned long long) (reserve_size/1024/1024));
|
|
}
|
|
|
|
GGML_ASSERT(pool_addr != 0);
|
|
|
|
void * ptr = (void *) (pool_addr + pool_used);
|
|
*actual_size = size;
|
|
pool_used += size;
|
|
|
|
#ifdef DEBUG_CUDA_MALLOC
|
|
printf("cuda pool[%d]: allocated %llu bytes at %llx\n", device, (unsigned long long) size, ptr);
|
|
#endif
|
|
|
|
return ptr;
|
|
}
|
|
|
|
void free(void * ptr, size_t size) override {
|
|
#ifdef DEBUG_CUDA_MALLOC
|
|
printf("cuda pool[%d]: freed %llu bytes at %llx\n", device, (unsigned long long) size, ptr);
|
|
#endif
|
|
|
|
pool_used -= size;
|
|
|
|
// all deallocations must be in reverse order of the allocations
|
|
GGML_ASSERT(ptr == (void *) (pool_addr + pool_used));
|
|
}
|
|
};
|
|
#endif // !defined(GGML_USE_HIPBLAS)
|
|
|
|
std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
|
|
#if !defined(GGML_USE_HIPBLAS)
|
|
if (ggml_cuda_info().devices[device].vmm) {
|
|
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
|
|
}
|
|
#endif
|
|
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
|
|
}
|
|
|
|
// cuda buffer
|
|
|
|
struct ggml_backend_cuda_buffer_context {
|
|
int device;
|
|
void * dev_ptr = nullptr;
|
|
std::string name;
|
|
|
|
ggml_backend_cuda_buffer_context(int device, void * dev_ptr) :
|
|
device(device), dev_ptr(dev_ptr),
|
|
name(GGML_CUDA_NAME + std::to_string(device)) {
|
|
}
|
|
|
|
~ggml_backend_cuda_buffer_context() {
|
|
CUDA_CHECK(cudaFree(dev_ptr));
|
|
}
|
|
};
|
|
|
|
GGML_CALL static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
|
|
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
|
return ctx->name.c_str();
|
|
}
|
|
|
|
GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
|
|
return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
|
|
}
|
|
|
|
GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
|
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
|
delete ctx;
|
|
}
|
|
|
|
GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
|
|
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
|
return ctx->dev_ptr;
|
|
}
|
|
|
|
GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
|
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
|
|
|
if (tensor->view_src != NULL && tensor->view_offs == 0) {
|
|
assert(tensor->view_src->buffer->buft == buffer->buft);
|
|
tensor->backend = tensor->view_src->backend;
|
|
tensor->extra = tensor->view_src->extra;
|
|
return;
|
|
}
|
|
|
|
if (ggml_is_quantized(tensor->type)) {
|
|
// initialize padding to 0 to avoid possible NaN values
|
|
size_t original_size = ggml_nbytes(tensor);
|
|
size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
|
|
|
|
if (padded_size > original_size && tensor->view_src == nullptr) {
|
|
ggml_cuda_set_device(ctx->device);
|
|
CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
|
|
}
|
|
}
|
|
}
|
|
|
|
GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
|
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
|
|
|
ggml_cuda_set_device(ctx->device);
|
|
CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
|
|
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
|
|
}
|
|
|
|
GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
|
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
|
|
|
ggml_cuda_set_device(ctx->device);
|
|
CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
|
|
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
|
|
}
|
|
|
|
GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
|
|
if (ggml_backend_buffer_is_cuda(src->buffer)) {
|
|
ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
|
|
ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)dst->buffer->context;
|
|
if (src_ctx->device == dst_ctx->device) {
|
|
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice, cudaStreamPerThread));
|
|
} else {
|
|
#ifdef GGML_CUDA_NO_PEER_COPY
|
|
return false;
|
|
#else
|
|
CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, dst_ctx->device, src->data, src_ctx->device, ggml_nbytes(src), cudaStreamPerThread));
|
|
#endif
|
|
}
|
|
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
|
|
return true;
|
|
}
|
|
return false;
|
|
|
|
GGML_UNUSED(buffer);
|
|
}
|
|
|
|
GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
|
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
|
|
|
ggml_cuda_set_device(ctx->device);
|
|
CUDA_CHECK(cudaDeviceSynchronize());
|
|
CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
|
|
CUDA_CHECK(cudaDeviceSynchronize());
|
|
}
|
|
|
|
static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
|
|
/* .get_name = */ ggml_backend_cuda_buffer_get_name,
|
|
/* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
|
|
/* .get_base = */ ggml_backend_cuda_buffer_get_base,
|
|
/* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
|
|
/* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor,
|
|
/* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
|
|
/* .cpy_tensor = */ ggml_backend_cuda_buffer_cpy_tensor,
|
|
/* .clear = */ ggml_backend_cuda_buffer_clear,
|
|
/* .reset = */ NULL,
|
|
};
|
|
|
|
// cuda buffer type
|
|
struct ggml_backend_cuda_buffer_type_context {
|
|
int device;
|
|
std::string name;
|
|
};
|
|
|
|
GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
|
ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
|
|
|
return ctx->name.c_str();
|
|
}
|
|
|
|
GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
|
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
|
|
|
ggml_cuda_set_device(buft_ctx->device);
|
|
|
|
size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
|
|
|
|
void * dev_ptr;
|
|
cudaError_t err = cudaMalloc(&dev_ptr, size);
|
|
if (err != cudaSuccess) {
|
|
fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
|
|
return nullptr;
|
|
}
|
|
|
|
ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);
|
|
|
|
return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
|
|
}
|
|
|
|
GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
|
return 128;
|
|
|
|
GGML_UNUSED(buft);
|
|
}
|
|
|
|
GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
|
size_t size = ggml_nbytes(tensor);
|
|
int64_t ne0 = tensor->ne[0];
|
|
|
|
if (ggml_is_quantized(tensor->type)) {
|
|
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
|
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
|
}
|
|
}
|
|
|
|
return size;
|
|
|
|
GGML_UNUSED(buft);
|
|
}
|
|
|
|
GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
|
if (!ggml_backend_is_cuda(backend)) {
|
|
return false;
|
|
}
|
|
|
|
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
|
|
|
return buft_ctx->device == cuda_ctx->device;
|
|
}
|
|
|
|
static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
|
|
/* .get_name = */ ggml_backend_cuda_buffer_type_name,
|
|
/* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
|
|
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
|
|
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
|
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
|
|
/* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
|
|
/* .is_host = */ NULL,
|
|
};
|
|
|
|
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
|
static std::mutex mutex;
|
|
std::lock_guard<std::mutex> lock(mutex);
|
|
|
|
if (device >= ggml_backend_cuda_get_device_count()) {
|
|
return nullptr;
|
|
}
|
|
|
|
static ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
|
|
|
|
static bool ggml_backend_cuda_buffer_type_initialized = false;
|
|
|
|
if (!ggml_backend_cuda_buffer_type_initialized) {
|
|
for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
|
|
ggml_backend_cuda_buffer_types[i] = {
|
|
/* .iface = */ ggml_backend_cuda_buffer_type_interface,
|
|
/* .context = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)},
|
|
};
|
|
}
|
|
ggml_backend_cuda_buffer_type_initialized = true;
|
|
}
|
|
|
|
return &ggml_backend_cuda_buffer_types[device];
|
|
}
|
|
|
|
// cuda split buffer
|
|
|
|
static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
|
|
int64_t min_compute_capability = INT_MAX;
|
|
int64_t max_compute_capability = INT_MIN;
|
|
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
|
if (tensor_split[id] < (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
|
|
if (min_compute_capability > ggml_cuda_info().devices[id].cc) {
|
|
min_compute_capability = ggml_cuda_info().devices[id].cc;
|
|
}
|
|
if (max_compute_capability < ggml_cuda_info().devices[id].cc) {
|
|
max_compute_capability = ggml_cuda_info().devices[id].cc;
|
|
}
|
|
}
|
|
}
|
|
|
|
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
|
switch(type) {
|
|
case GGML_TYPE_Q4_0:
|
|
case GGML_TYPE_Q4_1:
|
|
case GGML_TYPE_Q5_0:
|
|
case GGML_TYPE_Q5_1:
|
|
case GGML_TYPE_Q8_0:
|
|
return max_compute_capability >= CC_RDNA2 ? 128 : 64;
|
|
case GGML_TYPE_F16:
|
|
case GGML_TYPE_F32:
|
|
return 1;
|
|
case GGML_TYPE_Q2_K:
|
|
return max_compute_capability >= CC_RDNA2 ? 128 : 32;
|
|
case GGML_TYPE_Q3_K:
|
|
return min_compute_capability < CC_RDNA2 ? 128 : 64;
|
|
case GGML_TYPE_Q4_K:
|
|
case GGML_TYPE_Q5_K:
|
|
case GGML_TYPE_Q6_K:
|
|
case GGML_TYPE_IQ2_XXS:
|
|
case GGML_TYPE_IQ2_XS:
|
|
case GGML_TYPE_IQ2_S:
|
|
case GGML_TYPE_IQ3_XXS:
|
|
case GGML_TYPE_IQ1_S:
|
|
case GGML_TYPE_IQ1_M:
|
|
case GGML_TYPE_IQ4_NL:
|
|
case GGML_TYPE_IQ4_XS:
|
|
case GGML_TYPE_IQ3_S:
|
|
return max_compute_capability >= CC_RDNA2 ? 128 : 64;
|
|
default:
|
|
GGML_ASSERT(false);
|
|
}
|
|
#else
|
|
switch(type) {
|
|
case GGML_TYPE_Q4_0:
|
|
case GGML_TYPE_Q4_1:
|
|
return max_compute_capability >= CC_VOLTA ? 128 : 64;
|
|
case GGML_TYPE_Q5_0:
|
|
case GGML_TYPE_Q5_1:
|
|
case GGML_TYPE_Q8_0:
|
|
return 64;
|
|
case GGML_TYPE_F16:
|
|
case GGML_TYPE_F32:
|
|
return 1;
|
|
case GGML_TYPE_Q2_K:
|
|
case GGML_TYPE_Q3_K:
|
|
case GGML_TYPE_Q4_K:
|
|
case GGML_TYPE_Q5_K:
|
|
case GGML_TYPE_IQ2_XXS:
|
|
case GGML_TYPE_IQ2_XS:
|
|
case GGML_TYPE_IQ2_S:
|
|
case GGML_TYPE_IQ3_XXS:
|
|
case GGML_TYPE_IQ1_S:
|
|
case GGML_TYPE_IQ1_M:
|
|
case GGML_TYPE_IQ4_NL:
|
|
case GGML_TYPE_IQ4_XS:
|
|
case GGML_TYPE_IQ3_S:
|
|
return max_compute_capability >= CC_VOLTA ? 128 : 64;
|
|
case GGML_TYPE_Q6_K:
|
|
return 64;
|
|
default:
|
|
GGML_ASSERT(false);
|
|
}
|
|
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
|
}
|
|
|
|
static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) {
|
|
const int64_t nrows = ggml_nrows(tensor);
|
|
const int64_t rounding = get_row_rounding(tensor->type, tensor_split);
|
|
|
|
*row_low = id == 0 ? 0 : nrows*tensor_split[id];
|
|
*row_low -= *row_low % rounding;
|
|
|
|
if (id == ggml_backend_cuda_get_device_count() - 1) {
|
|
*row_high = nrows;
|
|
} else {
|
|
*row_high = nrows*tensor_split[id + 1];
|
|
*row_high -= *row_high % rounding;
|
|
}
|
|
}
|
|
|
|
static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
|
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
|
|
|
return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
|
|
}
|
|
|
|
struct ggml_backend_cuda_split_buffer_type_context {
|
|
std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
|
|
};
|
|
|
|
struct ggml_backend_cuda_split_buffer_context {
|
|
~ggml_backend_cuda_split_buffer_context() {
|
|
for (ggml_tensor_extra_gpu * extra : tensor_extras) {
|
|
for (int id = 0; id < GGML_CUDA_MAX_DEVICES; ++id) {
|
|
for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) {
|
|
if (extra->events[id][is] != nullptr) {
|
|
CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
|
|
}
|
|
}
|
|
if (extra->data_device[id] != nullptr) {
|
|
CUDA_CHECK(cudaFree(extra->data_device[id]));
|
|
}
|
|
}
|
|
delete extra;
|
|
}
|
|
}
|
|
|
|
std::vector<ggml_tensor_extra_gpu *> tensor_extras;
|
|
};
|
|
|
|
GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
|
|
return GGML_CUDA_NAME "_Split";
|
|
|
|
GGML_UNUSED(buffer);
|
|
}
|
|
|
|
static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
|
|
return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
|
|
GGML_UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds
|
|
}
|
|
|
|
GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
|
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
|
delete ctx;
|
|
}
|
|
|
|
GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
|
|
// the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
|
|
return (void *)0x1000;
|
|
|
|
GGML_UNUSED(buffer);
|
|
}
|
|
|
|
GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
|
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
|
|
|
|
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
|
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
|
|
|
|
const int64_t ne0 = tensor->ne[0];
|
|
|
|
ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
|
|
ctx->tensor_extras.push_back(extra);
|
|
|
|
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
|
int64_t row_low, row_high;
|
|
get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
|
|
|
|
int64_t nrows_split = row_high - row_low;
|
|
if (nrows_split == 0) {
|
|
continue;
|
|
}
|
|
|
|
size_t size = ggml_nbytes_split(tensor, nrows_split);
|
|
const size_t original_size = size;
|
|
|
|
// pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
|
|
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
|
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
|
}
|
|
|
|
// FIXME: do not crash if cudaMalloc fails
|
|
// currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
|
|
ggml_cuda_set_device(id);
|
|
char * buf;
|
|
CUDA_CHECK(cudaMalloc(&buf, size));
|
|
|
|
// set padding to 0 to avoid possible NaN values
|
|
if (size > original_size) {
|
|
CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
|
|
}
|
|
|
|
extra->data_device[id] = buf;
|
|
|
|
for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) {
|
|
CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
|
|
}
|
|
}
|
|
tensor->extra = extra;
|
|
}
|
|
|
|
GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
|
// split tensors must always be set in their entirety at once
|
|
GGML_ASSERT(offset == 0);
|
|
GGML_ASSERT(size == ggml_nbytes(tensor));
|
|
|
|
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
|
|
|
|
const int64_t ne0 = tensor->ne[0];
|
|
const size_t nb1 = tensor->nb[1];
|
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
|
|
|
|
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
|
int64_t row_low, row_high;
|
|
get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
|
|
|
|
int64_t nrows_split = row_high - row_low;
|
|
if (nrows_split == 0) {
|
|
continue;
|
|
}
|
|
|
|
const size_t offset_split = row_low*nb1;
|
|
size_t size = ggml_nbytes_split(tensor, nrows_split);
|
|
const size_t original_size = size;
|
|
|
|
// pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
|
|
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
|
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
|
}
|
|
|
|
const char * buf_host = (const char *)data + offset_split;
|
|
CUDA_CHECK(cudaMemcpyAsync(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice, cudaStreamPerThread));
|
|
}
|
|
|
|
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
|
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
|
|
}
|
|
}
|
|
|
|
GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
|
// split tensors must always be set in their entirety at once
|
|
GGML_ASSERT(offset == 0);
|
|
GGML_ASSERT(size == ggml_nbytes(tensor));
|
|
|
|
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
|
|
|
|
const int64_t ne0 = tensor->ne[0];
|
|
const size_t nb1 = tensor->nb[1];
|
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
|
|
|
|
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
|
int64_t row_low, row_high;
|
|
get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
|
|
|
|
int64_t nrows_split = row_high - row_low;
|
|
if (nrows_split == 0) {
|
|
continue;
|
|
}
|
|
|
|
const size_t offset_split = row_low*nb1;
|
|
size_t size = ggml_nbytes_split(tensor, nrows_split);
|
|
const size_t original_size = size;
|
|
|
|
// pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
|
|
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
|
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
|
}
|
|
|
|
char * buf_host = (char *)data + offset_split;
|
|
CUDA_CHECK(cudaMemcpyAsync(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
|
|
}
|
|
|
|
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
|
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
|
|
}
|
|
}
|
|
|
|
GGML_CALL static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
|
GGML_UNUSED(buffer);
|
|
GGML_UNUSED(value);
|
|
}
|
|
|
|
static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
|
|
/* .get_name = */ ggml_backend_cuda_split_buffer_get_name,
|
|
/* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer,
|
|
/* .get_base = */ ggml_backend_cuda_split_buffer_get_base,
|
|
/* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor,
|
|
/* .set_tensor = */ ggml_backend_cuda_split_buffer_set_tensor,
|
|
/* .get_tensor = */ ggml_backend_cuda_split_buffer_get_tensor,
|
|
/* .cpy_tensor = */ NULL,
|
|
/* .clear = */ ggml_backend_cuda_split_buffer_clear,
|
|
/* .reset = */ NULL,
|
|
};
|
|
|
|
// cuda split buffer type
|
|
|
|
GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
|
return GGML_CUDA_NAME "_Split";
|
|
|
|
GGML_UNUSED(buft);
|
|
}
|
|
|
|
GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
|
// since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
|
|
// instead, we allocate them for each tensor separately in init_tensor
|
|
// however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
|
|
// as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
|
|
ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context();
|
|
|
|
return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
|
|
}
|
|
|
|
GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
|
return 128;
|
|
|
|
GGML_UNUSED(buft);
|
|
}
|
|
|
|
GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
|
ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
|
|
|
|
size_t total_size = 0;
|
|
|
|
const int64_t ne0 = tensor->ne[0];
|
|
|
|
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
|
int64_t row_low, row_high;
|
|
get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id);
|
|
|
|
int64_t nrows_split = row_high - row_low;
|
|
if (nrows_split == 0) {
|
|
continue;
|
|
}
|
|
|
|
total_size += ggml_nbytes_split(tensor, nrows_split);
|
|
|
|
// pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
|
|
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
|
total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
|
}
|
|
}
|
|
|
|
return total_size;
|
|
}
|
|
|
|
GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
|
return ggml_backend_is_cuda(backend);
|
|
|
|
GGML_UNUSED(buft);
|
|
}
|
|
|
|
GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
|
return false;
|
|
|
|
GGML_UNUSED(buft);
|
|
}
|
|
|
|
static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = {
|
|
/* .get_name = */ ggml_backend_cuda_split_buffer_type_name,
|
|
/* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
|
|
/* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
|
|
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
|
/* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
|
|
/* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
|
|
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
|
|
};
|
|
|
|
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
|
|
static std::mutex mutex;
|
|
std::lock_guard<std::mutex> lock(mutex);
|
|
|
|
static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
|
|
|
|
std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {};
|
|
|
|
bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_CUDA_MAX_DEVICES, [](float x) { return x == 0.0f; });
|
|
if (all_zero) {
|
|
tensor_split_arr = ggml_cuda_info().default_tensor_split;
|
|
} else {
|
|
float split_sum = 0.0f;
|
|
for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
|
|
tensor_split_arr[i] = split_sum;
|
|
split_sum += tensor_split[i];
|
|
}
|
|
for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
|
|
tensor_split_arr[i] /= split_sum;
|
|
}
|
|
}
|
|
|
|
auto it = buft_map.find(tensor_split_arr);
|
|
if (it != buft_map.end()) {
|
|
return &it->second;
|
|
}
|
|
|
|
struct ggml_backend_buffer_type buft {
|
|
/* .iface = */ ggml_backend_cuda_split_buffer_type_interface,
|
|
/* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr},
|
|
};
|
|
|
|
auto result = buft_map.emplace(tensor_split_arr, buft);
|
|
return &result.first->second;
|
|
}
|
|
|
|
// host buffer type
|
|
|
|
GGML_CALL static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
|
return GGML_CUDA_NAME "_Host";
|
|
|
|
GGML_UNUSED(buft);
|
|
}
|
|
|
|
GGML_CALL static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
|
|
return GGML_CUDA_NAME "_Host";
|
|
|
|
GGML_UNUSED(buffer);
|
|
}
|
|
|
|
GGML_CALL static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
|
CUDA_CHECK(cudaFreeHost(buffer->context));
|
|
}
|
|
|
|
static void * ggml_cuda_host_malloc(size_t size) {
|
|
if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
|
|
return nullptr;
|
|
}
|
|
|
|
void * ptr = nullptr;
|
|
cudaError_t err = cudaMallocHost((void **) &ptr, size);
|
|
if (err != cudaSuccess) {
|
|
// clear the error
|
|
cudaGetLastError();
|
|
fprintf(stderr, "%s: warning: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
|
|
size/1024.0/1024.0, cudaGetErrorString(err));
|
|
return nullptr;
|
|
}
|
|
|
|
return ptr;
|
|
}
|
|
|
|
GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
|
void * ptr = ggml_cuda_host_malloc(size);
|
|
|
|
if (ptr == nullptr) {
|
|
// fallback to cpu buffer
|
|
return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
|
|
}
|
|
|
|
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
|
buffer->buft = buft;
|
|
buffer->iface.get_name = ggml_backend_cuda_host_buffer_name;
|
|
buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
|
|
|
|
return buffer;
|
|
}
|
|
|
|
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
|
|
static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
|
|
/* .iface = */ {
|
|
/* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
|
|
/* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
|
|
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
|
|
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
|
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
|
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
|
|
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
|
},
|
|
/* .context = */ nullptr,
|
|
};
|
|
|
|
return &ggml_backend_cuda_buffer_type_host;
|
|
}
|
|
|
|
//static bool ggml_backend_buffer_is_cuda_host(ggml_backend_buffer_t buffer) {
|
|
// return buffer->buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name;
|
|
//}
|
|
|
|
/// kernels
|
|
|
|
typedef void (*ggml_cuda_op_mul_mat_t)(
|
|
ggml_backend_cuda_context & ctx,
|
|
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
|
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
|
const int64_t src1_padded_row_size, cudaStream_t stream);
|
|
|
|
#ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE
|
|
#define GGML_CUDA_PEER_MAX_BATCH_SIZE 128
|
|
#endif // GGML_CUDA_PEER_MAX_BATCH_SIZE
|
|
|
|
#define MUL_MAT_SRC1_COL_STRIDE 128
|
|
|
|
static __global__ void mul_mat_p021_f16_f32(
|
|
const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
|
|
const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y) {
|
|
|
|
const half * x = (const half *) vx;
|
|
|
|
const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
|
|
const int channel = blockDim.z*blockIdx.z + threadIdx.z;
|
|
const int channel_x = channel / (nchannels_y / nchannels_x);
|
|
|
|
const int nrows_y = ncols_x;
|
|
const int nrows_dst = nrows_x;
|
|
const int row_dst = row_x;
|
|
|
|
float tmp = 0.0f;
|
|
|
|
for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
|
|
const int col_x = col_x0 + threadIdx.x;
|
|
|
|
if (col_x >= ncols_x) {
|
|
break;
|
|
}
|
|
|
|
// x is transposed and permuted
|
|
const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x;
|
|
const float xi = __half2float(x[ix]);
|
|
|
|
const int row_y = col_x;
|
|
|
|
// y is not transposed but permuted
|
|
const int iy = channel*nrows_y + row_y;
|
|
|
|
tmp += xi * y[iy];
|
|
}
|
|
|
|
// dst is not transposed and not permuted
|
|
const int idst = channel*nrows_dst + row_dst;
|
|
|
|
// sum up partial sums and write back result
|
|
tmp = warp_reduce_sum(tmp);
|
|
|
|
if (threadIdx.x == 0) {
|
|
dst[idst] = tmp;
|
|
}
|
|
}
|
|
|
|
static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
|
|
const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
|
|
const int row_stride_x, const int channel_stride_x, const int channel_x_divisor) {
|
|
|
|
const half * x = (const half *) vx;
|
|
|
|
const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
|
|
const int channel = blockDim.z*blockIdx.z + threadIdx.z;
|
|
const int channel_x = channel / channel_x_divisor;
|
|
|
|
const int nrows_y = ncols_x;
|
|
const int nrows_dst = nrows_x;
|
|
const int row_dst = row_x;
|
|
|
|
const int idst = channel*nrows_dst + row_dst;
|
|
|
|
float tmp = 0.0f;
|
|
|
|
for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
|
|
const int col_x = col_x0 + threadIdx.x;
|
|
|
|
if (col_x >= ncols_x) {
|
|
break;
|
|
}
|
|
|
|
const int row_y = col_x;
|
|
|
|
const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
|
|
const int iy = channel*nrows_y + row_y;
|
|
|
|
const float xi = __half2float(x[ix]);
|
|
|
|
tmp += xi * y[iy];
|
|
}
|
|
|
|
// sum up partial sums and write back result
|
|
tmp = warp_reduce_sum(tmp);
|
|
|
|
if (threadIdx.x == 0) {
|
|
dst[idst] = tmp;
|
|
}
|
|
}
|
|
|
|
static void ggml_mul_mat_p021_f16_f32_cuda(
|
|
const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
|
|
const int nchannels_x, const int nchannels_y, cudaStream_t stream) {
|
|
|
|
const dim3 block_nums(1, nrows_x, nchannels_y);
|
|
const dim3 block_dims(WARP_SIZE, 1, 1);
|
|
mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x, nchannels_y);
|
|
}
|
|
|
|
static void ggml_mul_mat_vec_nc_f16_f32_cuda(
|
|
const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x,
|
|
const int nchannels_x, const int nchannels_y, const int channel_stride_x, cudaStream_t stream) {
|
|
|
|
const dim3 block_nums(1, nrows_x, nchannels_y);
|
|
const dim3 block_dims(WARP_SIZE, 1, 1);
|
|
mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
|
|
(vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x);
|
|
}
|
|
|
|
static cudaError_t ggml_cuda_cpy_tensor_2d(
|
|
void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {
|
|
|
|
GGML_ASSERT(ggml_backend_buffer_is_cuda(src->buffer));
|
|
char * src_ptr = (char *) src->data;
|
|
char * dst_ptr = (char *) dst;
|
|
|
|
const int64_t ne0 = src->ne[0];
|
|
const int64_t nb0 = src->nb[0];
|
|
const int64_t nb1 = src->nb[1];
|
|
const int64_t nb2 = src->nb[2];
|
|
const int64_t nb3 = src->nb[3];
|
|
const enum ggml_type type = src->type;
|
|
const int64_t ts = ggml_type_size(type);
|
|
const int64_t bs = ggml_blck_size(type);
|
|
int64_t i1_diff = i1_high - i1_low;
|
|
|
|
const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
|
|
if (nb0 == ts && nb1 == ts*ne0/bs) {
|
|
return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, cudaMemcpyDeviceToDevice, stream);
|
|
} else if (nb0 == ts) {
|
|
return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, cudaMemcpyDeviceToDevice, stream);
|
|
} else {
|
|
for (int64_t i1 = 0; i1 < i1_diff; i1++) {
|
|
const void * rx = (const void *) ((const char *) x + i1*nb1);
|
|
void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
|
|
// pretend the row is a matrix with cols=1
|
|
cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, cudaMemcpyDeviceToDevice, stream);
|
|
if (r != cudaSuccess) {
|
|
return r;
|
|
}
|
|
}
|
|
return cudaSuccess;
|
|
}
|
|
}
|
|
|
|
static void ggml_cuda_op_mul_mat_cublas(
|
|
ggml_backend_cuda_context & ctx,
|
|
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
|
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
|
const int64_t src1_padded_row_size, cudaStream_t stream) {
|
|
|
|
GGML_ASSERT(src0_dd_i != nullptr);
|
|
GGML_ASSERT(src1_ddf_i != nullptr);
|
|
GGML_ASSERT(dst_dd_i != nullptr);
|
|
|
|
const int64_t ne00 = src0->ne[0];
|
|
const int64_t ne10 = src1->ne[0];
|
|
|
|
const int64_t ne0 = dst->ne[0];
|
|
|
|
const int64_t row_diff = row_high - row_low;
|
|
|
|
int id = ggml_cuda_get_device();
|
|
|
|
// the main device has a larger memory buffer to hold the results from all GPUs
|
|
// ldc == nrows of the matrix that cuBLAS writes into
|
|
int ldc = id == ctx.device ? ne0 : row_diff;
|
|
|
|
const int compute_capability = ggml_cuda_info().devices[id].cc;
|
|
|
|
if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
|
|
// convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
|
|
ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool());
|
|
if (src0->type != GGML_TYPE_F16) {
|
|
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
|
|
GGML_ASSERT(to_fp16_cuda != nullptr);
|
|
size_t ne = row_diff*ne00;
|
|
src0_as_f16.alloc(ne);
|
|
to_fp16_cuda(src0_dd_i, src0_as_f16.get(), ne, stream);
|
|
}
|
|
const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16.get();
|
|
|
|
ggml_cuda_pool_alloc<half> src1_as_f16(ctx.pool());
|
|
if (src1->type != GGML_TYPE_F16) {
|
|
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
|
|
GGML_ASSERT(to_fp16_cuda != nullptr);
|
|
size_t ne = src1_ncols*ne10;
|
|
src1_as_f16.alloc(ne);
|
|
to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
|
|
}
|
|
const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16.get();
|
|
ggml_cuda_pool_alloc<half> dst_f16(ctx.pool(), row_diff*src1_ncols);
|
|
|
|
const half alpha_f16 = 1.0f;
|
|
const half beta_f16 = 0.0f;
|
|
|
|
CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
|
|
CUBLAS_CHECK(
|
|
cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
|
|
row_diff, src1_ncols, ne10,
|
|
&alpha_f16, src0_ptr, CUDA_R_16F, ne00,
|
|
src1_ptr, CUDA_R_16F, ne10,
|
|
&beta_f16, dst_f16.get(), CUDA_R_16F, ldc,
|
|
CUBLAS_COMPUTE_16F,
|
|
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
|
|
|
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
|
|
to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
|
|
} else {
|
|
ggml_cuda_pool_alloc<float> src0_ddq_as_f32(ctx.pool(id));
|
|
ggml_cuda_pool_alloc<float> src1_ddq_as_f32(ctx.pool(id));
|
|
|
|
if (src0->type != GGML_TYPE_F32) {
|
|
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
|
|
GGML_ASSERT(to_fp32_cuda != nullptr);
|
|
src0_ddq_as_f32.alloc(row_diff*ne00);
|
|
to_fp32_cuda(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
|
|
}
|
|
if (src1->type != GGML_TYPE_F32) {
|
|
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src1->type);
|
|
GGML_ASSERT(to_fp32_cuda != nullptr);
|
|
src1_ddq_as_f32.alloc(src1_ncols*ne10);
|
|
to_fp32_cuda(src1_ddf_i, src1_ddq_as_f32.get(), src1_ncols*ne10, stream);
|
|
}
|
|
|
|
const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
|
|
const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();
|
|
|
|
const float alpha = 1.0f;
|
|
const float beta = 0.0f;
|
|
|
|
CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
|
|
CUBLAS_CHECK(
|
|
cublasSgemm(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
|
|
row_diff, src1_ncols, ne10,
|
|
&alpha, src0_ddf_i, ne00,
|
|
src1_ddf1_i, ne10,
|
|
&beta, dst_dd_i, ldc));
|
|
}
|
|
|
|
GGML_UNUSED(dst);
|
|
GGML_UNUSED(src1_ddq_i);
|
|
GGML_UNUSED(src1_padded_row_size);
|
|
}
|
|
|
|
static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
|
|
static bool peer_access_enabled = false;
|
|
|
|
const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
|
|
|
|
if (peer_access_enabled == enable_peer_access) {
|
|
return;
|
|
}
|
|
|
|
#ifdef NDEBUG
|
|
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
|
ggml_cuda_set_device(id);
|
|
CUDA_CHECK(cudaDeviceSynchronize());
|
|
}
|
|
|
|
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
|
ggml_cuda_set_device(id);
|
|
|
|
for (int id_other = 0; id_other < ggml_backend_cuda_get_device_count(); ++id_other) {
|
|
if (id == id_other) {
|
|
continue;
|
|
}
|
|
if (id != main_device && id_other != main_device) {
|
|
continue;
|
|
}
|
|
|
|
int can_access_peer;
|
|
CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
|
|
if (can_access_peer) {
|
|
if (enable_peer_access) {
|
|
cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0);
|
|
if (err != cudaErrorPeerAccessAlreadyEnabled) {
|
|
CUDA_CHECK(err);
|
|
}
|
|
} else {
|
|
cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
|
|
if (err != cudaErrorPeerAccessNotEnabled) {
|
|
CUDA_CHECK(err);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
ggml_cuda_set_device(main_device);
|
|
#endif // NDEBUG
|
|
|
|
peer_access_enabled = enable_peer_access;
|
|
|
|
GGML_UNUSED(main_device);
|
|
}
|
|
|
|
static void ggml_cuda_op_mul_mat(
|
|
ggml_backend_cuda_context & ctx,
|
|
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
|
|
const bool convert_src1_to_q8_1) {
|
|
|
|
const int64_t ne00 = src0->ne[0];
|
|
const int64_t ne01 = src0->ne[1];
|
|
const int64_t ne02 = src0->ne[2];
|
|
const int64_t ne03 = src0->ne[3];
|
|
|
|
const int64_t ne10 = src1->ne[0];
|
|
const int64_t ne11 = src1->ne[1];
|
|
const int64_t ne12 = src1->ne[2];
|
|
const int64_t ne13 = src1->ne[3];
|
|
const int64_t nrows1 = ggml_nrows(src1);
|
|
|
|
GGML_ASSERT(ne03 == ne13);
|
|
|
|
const int64_t ne0 = dst->ne[0];
|
|
const int64_t ne1 = dst->ne[1];
|
|
|
|
const int nb2 = dst->nb[2];
|
|
const int nb3 = dst->nb[3];
|
|
|
|
GGML_ASSERT(ggml_backend_buffer_is_cuda(dst->buffer));
|
|
GGML_ASSERT(ggml_backend_buffer_is_cuda(src1->buffer));
|
|
ggml_backend_cuda_buffer_context * src1_ctx = (ggml_backend_cuda_buffer_context *) src1->buffer->context;
|
|
ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *) dst->buffer->context;
|
|
|
|
GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));
|
|
|
|
GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
|
|
|
|
const int64_t i02_divisor = ne12 / ne02;
|
|
|
|
const size_t src0_ts = ggml_type_size(src0->type);
|
|
const size_t src0_bs = ggml_blck_size(src0->type);
|
|
const size_t q8_1_ts = sizeof(block_q8_1);
|
|
const size_t q8_1_bs = QK8_1;
|
|
|
|
const bool src0_is_contiguous = ggml_is_contiguous(src0);
|
|
const bool src1_is_contiguous = ggml_is_contiguous(src1);
|
|
|
|
const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
|
|
|
|
const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);
|
|
GGML_ASSERT(!(split && ne02 > 1));
|
|
GGML_ASSERT(!(split && ne03 > 1));
|
|
GGML_ASSERT(!(split && ne02 < ne12));
|
|
|
|
ggml_tensor_extra_gpu * src0_extra = split ? (ggml_tensor_extra_gpu *) src0->extra : nullptr;
|
|
|
|
|
|
std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
|
|
if (split) {
|
|
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
|
|
tensor_split = buft_ctx->tensor_split;
|
|
}
|
|
|
|
struct dev_data {
|
|
ggml_cuda_pool_alloc<char> src0_dd_alloc;
|
|
ggml_cuda_pool_alloc<float> src1_ddf_alloc;
|
|
ggml_cuda_pool_alloc<char> src1_ddq_alloc;
|
|
ggml_cuda_pool_alloc<float> dst_dd_alloc;
|
|
|
|
char * src0_dd = nullptr;
|
|
float * src1_ddf = nullptr; // float
|
|
char * src1_ddq = nullptr; // q8_1
|
|
float * dst_dd = nullptr;
|
|
|
|
int64_t row_low;
|
|
int64_t row_high;
|
|
};
|
|
|
|
dev_data dev[GGML_CUDA_MAX_DEVICES];
|
|
|
|
int used_devices = 0;
|
|
|
|
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
|
// by default, use all rows
|
|
dev[id].row_low = 0;
|
|
dev[id].row_high = ne01;
|
|
|
|
// for multi GPU, get the row boundaries from tensor split
|
|
// and round to mul_mat_q tile sizes
|
|
if (split) {
|
|
const int64_t rounding = get_row_rounding(src0->type, tensor_split);
|
|
|
|
if (id != 0) {
|
|
dev[id].row_low = ne01*tensor_split[id];
|
|
if (dev[id].row_low < ne01) {
|
|
dev[id].row_low -= dev[id].row_low % rounding;
|
|
}
|
|
}
|
|
|
|
if (id != ggml_backend_cuda_get_device_count() - 1) {
|
|
dev[id].row_high = ne01*tensor_split[id + 1];
|
|
if (dev[id].row_high < ne01) {
|
|
dev[id].row_high -= dev[id].row_high % rounding;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
|
if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) {
|
|
continue;
|
|
}
|
|
|
|
used_devices++;
|
|
|
|
const bool src1_on_device = id == src1_ctx->device;
|
|
const bool dst_on_device = id == dst_ctx->device;
|
|
|
|
ggml_cuda_set_device(id);
|
|
cudaStream_t stream = ctx.stream(id, 0);
|
|
|
|
if (src0_is_contiguous) {
|
|
dev[id].src0_dd = split ? (char *) src0_extra->data_device[id] : (char *) src0->data;
|
|
} else {
|
|
dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), ggml_nbytes(src0));
|
|
}
|
|
|
|
if (src1_on_device && src1_is_contiguous) {
|
|
dev[id].src1_ddf = (float *) src1->data;
|
|
} else {
|
|
dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1));
|
|
}
|
|
|
|
if (convert_src1_to_q8_1) {
|
|
dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
|
|
|
|
if (src1_on_device && src1_is_contiguous) {
|
|
quantize_row_q8_1_cuda(dev[id].src1_ddf, dev[id].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
|
|
CUDA_CHECK(cudaGetLastError());
|
|
}
|
|
}
|
|
|
|
if (dst_on_device) {
|
|
dev[id].dst_dd = (float *) dst->data;
|
|
} else {
|
|
const size_t size_dst_ddf = split ? (dev[id].row_high - dev[id].row_low)*ne1 : ggml_nelements(dst);
|
|
dev[id].dst_dd = dev[id].dst_dd_alloc.alloc(ctx.pool(id), size_dst_ddf);
|
|
}
|
|
}
|
|
|
|
// if multiple devices are used they need to wait for the main device
|
|
// here an event is recorded that signals that the main device has finished calculating the input data
|
|
if (split && used_devices > 1) {
|
|
ggml_cuda_set_device(ctx.device);
|
|
CUDA_CHECK(cudaEventRecord(src0_extra->events[ctx.device][0], ctx.stream()));
|
|
}
|
|
|
|
const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
|
|
for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
|
|
const int64_t is = split ? (src1_col_0/src1_col_stride) % GGML_CUDA_MAX_STREAMS : 0;
|
|
const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
|
|
|
|
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
|
if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) {
|
|
continue;
|
|
}
|
|
|
|
const bool src1_on_device = id == src1_ctx->device;
|
|
const bool dst_on_device = id == dst_ctx->device;
|
|
const int64_t row_diff = dev[id].row_high - dev[id].row_low;
|
|
|
|
ggml_cuda_set_device(id);
|
|
cudaStream_t stream = ctx.stream(id, is);
|
|
|
|
// wait for main GPU data if necessary
|
|
if (split && (id != ctx.device || is != 0)) {
|
|
CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[ctx.device][0], 0));
|
|
}
|
|
|
|
for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
|
|
const int64_t i03 = i0 / ne12;
|
|
const int64_t i02 = i0 % ne12;
|
|
|
|
const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
|
|
|
|
// for split tensors the data begins at i0 == i0_offset_low
|
|
char * src0_dd_i = dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
|
|
float * src1_ddf_i = dev[id].src1_ddf + (i0*ne11 + src1_col_0) * ne10;
|
|
char * src1_ddq_i = dev[id].src1_ddq + src1_ddq_i_offset;
|
|
float * dst_dd_i = dev[id].dst_dd + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
|
|
|
|
// the main device memory buffer can be on VRAM scratch, with space for all partial results
|
|
// in that case an offset on dst_ddf_i is needed
|
|
if (id == ctx.device) {
|
|
dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split
|
|
}
|
|
|
|
// copy src0, src1 to device if necessary
|
|
if (src1_is_contiguous) {
|
|
if (id != ctx.device) {
|
|
if (convert_src1_to_q8_1) {
|
|
char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset;
|
|
CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddq_i, id, src1_ddq_i_source, ctx.device,
|
|
src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
|
|
} else {
|
|
float * src1_ddf_i_source = (float *) src1->data;
|
|
src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
|
|
CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddf_i, id, src1_ddf_i_source, ctx.device,
|
|
src1_ncols*ne10*sizeof(float), stream));
|
|
}
|
|
}
|
|
} else if (src1_on_device && !src1_is_contiguous) {
|
|
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
|
|
src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
|
|
} else {
|
|
GGML_ASSERT(false);
|
|
}
|
|
|
|
if (convert_src1_to_q8_1 && !src1_is_contiguous) {
|
|
quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
|
|
CUDA_CHECK(cudaGetLastError());
|
|
}
|
|
|
|
if (src1_col_0 == 0 && !src0_is_contiguous && i02 % i02_divisor == 0) {
|
|
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, dev[id].row_low, dev[id].row_high, stream));
|
|
}
|
|
|
|
// do the computation
|
|
op(ctx, src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
|
|
dev[id].row_low, dev[id].row_high, src1_ncols, src1_padded_col_size, stream);
|
|
CUDA_CHECK(cudaGetLastError());
|
|
|
|
// copy dst to host or other device if necessary
|
|
if (!dst_on_device) {
|
|
void * dst_off_device = dst->data;
|
|
if (split) {
|
|
// src0 = weight matrix is saved as a transposed matrix for better memory layout.
|
|
// dst is NOT transposed.
|
|
// The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
|
|
// Instead they need to be copied to the correct slice in ne0 = dst row index.
|
|
// If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
|
|
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
|
|
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
|
|
dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;
|
|
#if !defined(GGML_USE_HIPBLAS)
|
|
// cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
|
|
cudaMemcpy3DPeerParms p = {};
|
|
p.dstDevice = ctx.device;
|
|
p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), row_diff, src1_ncols);
|
|
p.srcDevice = id;
|
|
p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols);
|
|
p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1);
|
|
CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream));
|
|
#else
|
|
// HIP does not support cudaMemcpy3DPeerAsync or vmm pools
|
|
CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float),
|
|
dst_dd_i, row_diff*sizeof(float),
|
|
row_diff*sizeof(float), src1_ncols,
|
|
cudaMemcpyDeviceToDevice, stream));
|
|
#endif
|
|
} else {
|
|
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
|
|
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
|
|
dhf_dst_i += src1_col_0*ne0;
|
|
CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), cudaMemcpyDeviceToDevice, stream));
|
|
}
|
|
}
|
|
|
|
// add event for the main device to wait on until other device is done
|
|
if (split && (id != ctx.device || is != 0)) {
|
|
CUDA_CHECK(cudaEventRecord(src0_extra->events[id][is], stream));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// main device waits for all other devices to be finished
|
|
if (split && ggml_backend_cuda_get_device_count() > 1) {
|
|
int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
|
|
is_max = is_max <= GGML_CUDA_MAX_STREAMS ? is_max : GGML_CUDA_MAX_STREAMS;
|
|
|
|
ggml_cuda_set_device(ctx.device);
|
|
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
|
if (dev[id].row_low == dev[id].row_high) {
|
|
continue;
|
|
}
|
|
for (int64_t is = 0; is < is_max; ++is) {
|
|
CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), src0_extra->events[id][is], 0));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
|
|
GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
|
|
GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
|
|
GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
|
|
GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
|
|
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
|
|
|
const int64_t ne00 = src0->ne[0];
|
|
const int64_t ne01 = src0->ne[1];
|
|
const int64_t ne02 = src0->ne[2];
|
|
|
|
const int64_t ne12 = src1->ne[2];
|
|
|
|
cudaStream_t main_stream = ctx.stream();
|
|
|
|
void * src0_ddq = src0->data;
|
|
float * src1_ddf = (float *) src1->data;
|
|
float * dst_ddf = (float *) dst->data;
|
|
|
|
ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
|
|
}
|
|
|
|
static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
|
|
GGML_ASSERT(!ggml_is_transposed(src0));
|
|
GGML_ASSERT(!ggml_is_transposed(src1));
|
|
GGML_ASSERT(!ggml_is_permuted(src0));
|
|
GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
|
|
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
|
|
|
const int64_t ne00 = src0->ne[0];
|
|
const int64_t ne01 = src0->ne[1];
|
|
const int64_t ne02 = src0->ne[2];
|
|
|
|
const int64_t nb01 = src0->nb[1];
|
|
const int64_t nb02 = src0->nb[2];
|
|
|
|
const int64_t ne12 = src1->ne[2];
|
|
|
|
cudaStream_t main_stream = ctx.stream();
|
|
|
|
void * src0_ddq = src0->data;
|
|
float * src1_ddf = (float *) src1->data;
|
|
float * dst_ddf = (float *) dst->data;
|
|
|
|
const int64_t row_stride_x = nb01 / sizeof(half);
|
|
const int64_t channel_stride_x = nb02 / sizeof(half);
|
|
|
|
ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
|
|
}
|
|
|
|
static __global__ void k_compute_batched_ptrs(
|
|
const half * src0_as_f16, const half * src1_as_f16, char * dst,
|
|
const void ** ptrs_src, void ** ptrs_dst,
|
|
int64_t ne12, int64_t ne13,
|
|
int64_t ne23,
|
|
size_t nb02, size_t nb03,
|
|
size_t nb12, size_t nb13,
|
|
size_t nbd2, size_t nbd3,
|
|
int64_t r2, int64_t r3) {
|
|
int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
|
|
int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
|
|
|
|
if (i13 >= ne13 || i12 >= ne12) {
|
|
return;
|
|
}
|
|
|
|
int64_t i03 = i13 / r3;
|
|
int64_t i02 = i12 / r2;
|
|
|
|
ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
|
|
ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
|
|
ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
|
|
}
|
|
|
|
static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
GGML_ASSERT(!ggml_is_transposed(src0));
|
|
GGML_ASSERT(!ggml_is_transposed(src1));
|
|
|
|
GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
|
|
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
|
|
|
GGML_TENSOR_BINARY_OP_LOCALS
|
|
|
|
const int64_t ne_dst = ggml_nelements(dst);
|
|
|
|
cudaStream_t main_stream = ctx.stream();
|
|
|
|
CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(), main_stream));
|
|
|
|
void * src0_ddq = src0->data;
|
|
half * src0_f16 = (half *) src0_ddq;
|
|
float * src1_ddf = (float *) src1->data;
|
|
float * dst_ddf = (float *) dst->data;
|
|
|
|
// convert src1 to fp16
|
|
ggml_cuda_pool_alloc<half> src1_f16_alloc(ctx.pool());
|
|
if (src1->type != GGML_TYPE_F16) {
|
|
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
|
|
const int64_t ne_src1 = ggml_nelements(src1);
|
|
src1_f16_alloc.alloc(ne_src1);
|
|
GGML_ASSERT(to_fp16_cuda != nullptr);
|
|
to_fp16_cuda(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
|
|
}
|
|
half * src1_f16 = src1->type == GGML_TYPE_F16 ? (half *) src1_ddf : src1_f16_alloc.get();
|
|
|
|
ggml_cuda_pool_alloc<half> dst_f16(ctx.pool());
|
|
char * dst_t;
|
|
|
|
cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
|
|
cudaDataType_t cu_data_type = CUDA_R_16F;
|
|
|
|
// dst strides
|
|
size_t nbd2 = dst->nb[2];
|
|
size_t nbd3 = dst->nb[3];
|
|
|
|
const half alpha_f16 = 1.0f;
|
|
const half beta_f16 = 0.0f;
|
|
|
|
const float alpha_f32 = 1.0f;
|
|
const float beta_f32 = 0.0f;
|
|
|
|
const void * alpha = &alpha_f16;
|
|
const void * beta = &beta_f16;
|
|
|
|
if (dst->op_params[0] == GGML_PREC_DEFAULT) {
|
|
dst_t = (char *) dst_f16.alloc(ne_dst);
|
|
|
|
nbd2 /= sizeof(float) / sizeof(half);
|
|
nbd3 /= sizeof(float) / sizeof(half);
|
|
} else {
|
|
dst_t = (char *) dst_ddf;
|
|
|
|
cu_compute_type = CUBLAS_COMPUTE_32F;
|
|
cu_data_type = CUDA_R_32F;
|
|
|
|
alpha = &alpha_f32;
|
|
beta = &beta_f32;
|
|
}
|
|
|
|
GGML_ASSERT(ne12 % ne02 == 0);
|
|
GGML_ASSERT(ne13 % ne03 == 0);
|
|
|
|
// broadcast factors
|
|
const int64_t r2 = ne12/ne02;
|
|
const int64_t r3 = ne13/ne03;
|
|
|
|
#if 0
|
|
// use cublasGemmEx
|
|
{
|
|
for (int i13 = 0; i13 < ne13; ++i13) {
|
|
for (int i12 = 0; i12 < ne12; ++i12) {
|
|
int i03 = i13 / r3;
|
|
int i02 = i12 / r2;
|
|
|
|
CUBLAS_CHECK(
|
|
cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
|
|
ne01, ne11, ne10,
|
|
alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half),
|
|
(const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
|
|
beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01,
|
|
cu_compute_type,
|
|
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
|
}
|
|
}
|
|
}
|
|
#else
|
|
if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
|
|
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
|
|
// use cublasGemmStridedBatchedEx
|
|
CUBLAS_CHECK(
|
|
cublasGemmStridedBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
|
|
ne01, ne11, ne10,
|
|
alpha, (const char *) src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA
|
|
(const char *) src1_f16, CUDA_R_16F, nb11/nb10, nb12/nb10, // strideB
|
|
beta, ( char *) dst_t, cu_data_type, ne01, nb2/nb0, // strideC
|
|
ne12*ne13,
|
|
cu_compute_type,
|
|
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
|
} else {
|
|
// use cublasGemmBatchedEx
|
|
const int ne23 = ne12*ne13;
|
|
|
|
ggml_cuda_pool_alloc<const void *> ptrs_src(ctx.pool(), 2*ne23);
|
|
ggml_cuda_pool_alloc< void *> ptrs_dst(ctx.pool(), 1*ne23);
|
|
|
|
dim3 block_dims(ne13, ne12);
|
|
k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
|
|
src0_f16, src1_f16, dst_t,
|
|
ptrs_src.get(), ptrs_dst.get(),
|
|
ne12, ne13,
|
|
ne23,
|
|
nb02, nb03,
|
|
src1->type == GGML_TYPE_F16 ? nb12 : nb12/2,
|
|
src1->type == GGML_TYPE_F16 ? nb13 : nb13/2,
|
|
nbd2, nbd3,
|
|
r2, r3);
|
|
CUDA_CHECK(cudaGetLastError());
|
|
|
|
CUBLAS_CHECK(
|
|
cublasGemmBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
|
|
ne01, ne11, ne10,
|
|
alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/nb00,
|
|
(const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/nb10,
|
|
beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01,
|
|
ne23,
|
|
cu_compute_type,
|
|
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
|
}
|
|
#endif
|
|
|
|
if (dst->op_params[0] == GGML_PREC_DEFAULT) {
|
|
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
|
|
to_fp32_cuda(dst_f16.get(), dst_ddf, ne_dst, main_stream);
|
|
}
|
|
}
|
|
|
|
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);
|
|
|
|
int64_t min_compute_capability = INT_MAX;
|
|
|
|
bool any_pascal_with_slow_fp16 = false;
|
|
if (split) {
|
|
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
|
|
auto & tensor_split = buft_ctx->tensor_split;
|
|
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
|
// skip devices that are not going to do any work:
|
|
if (tensor_split[id] >= (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
|
|
continue;
|
|
}
|
|
|
|
if (min_compute_capability > ggml_cuda_info().devices[id].cc) {
|
|
min_compute_capability = ggml_cuda_info().devices[id].cc;
|
|
}
|
|
if (ggml_cuda_info().devices[id].cc == 610) {
|
|
any_pascal_with_slow_fp16 = true;
|
|
}
|
|
}
|
|
} else {
|
|
min_compute_capability = ggml_cuda_info().devices[ctx.device].cc;
|
|
any_pascal_with_slow_fp16 = ggml_cuda_info().devices[ctx.device].cc == 610;
|
|
}
|
|
|
|
// check data types and tensor shapes for custom matrix multiplication kernels:
|
|
bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
|
|
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
|
&& src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;
|
|
|
|
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
|
|
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
|
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
|
|
|
|
bool use_mul_mat_q = ggml_cuda_supports_mmq(src0->type)
|
|
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
|
|
|
|
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
|
|
|
const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
|
|
|
|
#ifdef CUDA_USE_TENSOR_CORES
|
|
use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
|
|
#endif // CUDA_USE_TENSOR_CORES
|
|
|
|
#else
|
|
|
|
// fp16 performance is good on Volta or newer and on P100 (compute capability 6.0)
|
|
const bool fp16_performance_good = min_compute_capability >= CC_PASCAL && !any_pascal_with_slow_fp16;
|
|
|
|
// mmvq and mmq need the __dp4a instruction which on NVIDIA is only available for CC >= 6.1
|
|
use_mul_mat_vec_q = use_mul_mat_vec_q && min_compute_capability >= MIN_CC_DP4A;
|
|
use_mul_mat_q = use_mul_mat_q && min_compute_capability >= MIN_CC_DP4A;
|
|
|
|
#ifdef CUDA_USE_TENSOR_CORES
|
|
// when tensor cores are available, use them for large batch size
|
|
// ref: https://github.com/ggerganov/llama.cpp/pull/3776
|
|
use_mul_mat_q = use_mul_mat_q && (!fp16_performance_good || src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
|
|
#endif // CUDA_USE_TENSOR_CORES
|
|
|
|
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
|
|
|
// if mmvq is available it's a better choice than dmmv:
|
|
#ifndef GGML_CUDA_FORCE_DMMV
|
|
use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
|
|
#endif // GGML_CUDA_FORCE_DMMV
|
|
|
|
// debug helpers
|
|
//printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
|
|
//printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
|
|
//printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
|
|
//printf(" %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
|
|
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
|
|
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
|
|
|
|
if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
|
|
// KQ single-batch
|
|
ggml_cuda_mul_mat_vec_p021(ctx, src0, src1, dst);
|
|
} else if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
|
// KQV single-batch
|
|
ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst);
|
|
} else if (!split && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
|
|
// KQ + KQV multi-batch
|
|
ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
|
|
} else if (use_dequantize_mul_mat_vec) {
|
|
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
|
|
} else if (use_mul_mat_vec_q) {
|
|
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
|
|
} else if (use_mul_mat_q) {
|
|
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
|
|
} else {
|
|
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
|
|
}
|
|
}
|
|
|
|
#if 0
|
|
template<typename ... Srcs>
|
|
static __global__ void k_compute_batched_ptrs_id(
|
|
const void ** ptrs_src, void ** ptrs_dst,
|
|
int ne12, int ne13,
|
|
int ne23,
|
|
int nb02, int nb03,
|
|
int nb12, int nb13,
|
|
int nb2, int nb3,
|
|
int r2, int r3,
|
|
ggml_type src0_type, half * src0_as_f16, int64_t src0_ne,
|
|
const half * src1_f16, half * dst_f16,
|
|
const int32_t * ids, const int id,
|
|
Srcs... src0s) {
|
|
|
|
int i = ids[id];
|
|
|
|
half * src0_f16;
|
|
const void * srcs_ar[] = { (const half *) src0s... };
|
|
if (src0_type == GGML_TYPE_F16) {
|
|
src0_f16 = (half *) srcs_ar[i];
|
|
} else {
|
|
src0_f16 = src0_as_f16;
|
|
if (threadIdx.x == 0 && threadIdx.y == 0) {
|
|
const to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(src0_type);
|
|
to_fp16(srcs_ar[i], src0_f16, src0_ne, cudaStreamFireAndForget);
|
|
}
|
|
}
|
|
|
|
int i13 = blockIdx.x * blockDim.x + threadIdx.x;
|
|
int i12 = blockIdx.y * blockDim.y + threadIdx.y;
|
|
|
|
if (i13 >= ne13 || i12 >= ne12) {
|
|
return;
|
|
}
|
|
|
|
int i03 = i13 / r3;
|
|
int i02 = i12 / r2;
|
|
|
|
ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_f16 + i02*nb02 + i03*nb03;
|
|
ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_f16 + i12*nb12/2 + i13*nb13/2;
|
|
ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
|
|
}
|
|
|
|
static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
|
|
const struct ggml_tensor * ids = dst->src[0];
|
|
const struct ggml_tensor * src1 = dst->src[1];
|
|
const struct ggml_tensor * src00 = dst->src[2];
|
|
|
|
const int id = dst->op_params[0];
|
|
|
|
GGML_ASSERT(!ggml_is_transposed(src00));
|
|
GGML_ASSERT(!ggml_is_transposed(src1));
|
|
|
|
GGML_ASSERT(src00->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
|
|
|
const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00);
|
|
const int64_t ne01 = src00->ne[1];
|
|
const int64_t ne02 = src00->ne[2];
|
|
const int64_t ne03 = src00->ne[3];
|
|
|
|
//const int64_t nb01 = src00->nb[1];
|
|
const int64_t nb02 = src00->nb[2]; GGML_UNUSED(nb02);
|
|
const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03);
|
|
|
|
const int64_t ne10 = src1->ne[0];
|
|
const int64_t ne11 = src1->ne[1];
|
|
const int64_t ne12 = src1->ne[2];
|
|
const int64_t ne13 = src1->ne[3];
|
|
|
|
//const int64_t nb11 = src1->nb[1];
|
|
const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
|
|
const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
|
|
|
|
const int64_t ne1 = ggml_nelements(src1);
|
|
const int64_t ne = ggml_nelements(dst);
|
|
|
|
ggml_cuda_set_device(g_main_device);
|
|
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
|
|
|
CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
|
|
|
|
//ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
|
//void * src0_ddq = src0_extra->data_device[g_main_device];
|
|
//half * src0_as_f16 = (half *) src0_ddq;
|
|
|
|
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
|
float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
|
|
|
|
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
|
float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
|
|
|
|
// convert src1 to fp16
|
|
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
|
|
GGML_ASSERT(to_fp16_cuda != nullptr);
|
|
|
|
size_t src1_as = 0;
|
|
half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
|
|
to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
|
|
|
|
size_t dst_as = 0;
|
|
half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
|
|
|
|
GGML_ASSERT(ne12 % ne02 == 0);
|
|
GGML_ASSERT(ne13 % ne03 == 0);
|
|
|
|
// broadcast factors
|
|
const int64_t r2 = ne12/ne02;
|
|
const int64_t r3 = ne13/ne03;
|
|
|
|
const half alpha_f16 = 1.0f;
|
|
const half beta_f16 = 0.0f;
|
|
|
|
// use cublasGemmBatchedEx
|
|
const int ne23 = ne12*ne13;
|
|
|
|
const void ** ptrs_src = nullptr;
|
|
void ** ptrs_dst = nullptr;
|
|
|
|
size_t ptrs_src_s = 0;
|
|
size_t ptrs_dst_s = 0;
|
|
|
|
ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
|
|
ptrs_dst = ( void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
|
|
|
|
int64_t src0_ne = ggml_nelements(src00);
|
|
half * src0_as_f16 = nullptr;
|
|
size_t src0_as = 0;
|
|
if (src00->type != GGML_TYPE_F16) {
|
|
src0_as_f16 = (half *) ggml_cuda_pool_malloc(src0_ne * sizeof(half), &src0_as);
|
|
}
|
|
|
|
static_assert(GGML_MAX_SRC == 6, "GGML_MAX_SRC == 6");
|
|
dim3 block_dims(ne13, ne12);
|
|
k_compute_batched_ptrs_id<<<1, block_dims, 0, main_stream>>>(
|
|
ptrs_src, ptrs_dst,
|
|
ne12, ne13,
|
|
ne23,
|
|
ne00*ne01*sizeof(half), ne00*ne01*ne02*sizeof(half),
|
|
nb12, nb13,
|
|
dst->nb[2], dst->nb[3],
|
|
r2, r3,
|
|
src00->type, src0_as_f16, src0_ne,
|
|
src1_as_f16, dst_f16,
|
|
(const int *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device], id,
|
|
dst->src[2] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[2]->extra)->data_device[g_main_device] : nullptr,
|
|
dst->src[3] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[3]->extra)->data_device[g_main_device] : nullptr,
|
|
dst->src[4] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device] : nullptr,
|
|
dst->src[5] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device] : nullptr
|
|
);
|
|
CUDA_CHECK(cudaGetLastError());
|
|
|
|
CUBLAS_CHECK(
|
|
cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
|
|
ne01, ne11, ne10,
|
|
&alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, ne00,
|
|
(const void **) (ptrs_src + 1*ne23), CUDA_R_16F, ne10,
|
|
&beta_f16, ( void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
|
|
ne23,
|
|
CUBLAS_COMPUTE_16F,
|
|
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
|
|
|
if (src0_as != 0) {
|
|
ggml_cuda_pool_free(src0_as_f16, src0_as);
|
|
}
|
|
if (ptrs_src_s != 0) {
|
|
ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
|
|
}
|
|
if (ptrs_dst_s != 0) {
|
|
ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
|
|
}
|
|
|
|
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
|
|
to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
|
|
|
|
ggml_cuda_pool_free(src1_as_f16, src1_as);
|
|
ggml_cuda_pool_free(dst_f16, dst_as);
|
|
}
|
|
#endif
|
|
|
|
static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
|
#if 0
|
|
ggml_cuda_mul_mat_id_cublas(dst);
|
|
// TODO: mmq/mmv support
|
|
#endif
|
|
const ggml_tensor * src0 = dst->src[0];
|
|
const ggml_tensor * src1 = dst->src[1];
|
|
|
|
cudaStream_t stream = ctx.stream();
|
|
|
|
const size_t nb11 = src1->nb[1];
|
|
const size_t nb1 = dst->nb[1];
|
|
|
|
const struct ggml_tensor * ids = src0;
|
|
const int32_t id = ((int32_t *) dst->op_params)[0];
|
|
const int32_t n_as = ((int32_t *) dst->op_params)[1];
|
|
|
|
std::vector<char> ids_host(ggml_nbytes(ids));
|
|
const char * ids_dev = (const char *) ids->data;
|
|
CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
|
|
CUDA_CHECK(cudaStreamSynchronize(stream));
|
|
|
|
ggml_tensor src1_row = *src1;
|
|
ggml_tensor dst_row = *dst;
|
|
|
|
char * src1_original = (char *) src1->data;
|
|
char * dst_original = (char *) dst->data;
|
|
|
|
if (src1->ne[1] == 1) {
|
|
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
|
|
const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
|
|
|
|
GGML_ASSERT(row_id >= 0 && row_id < n_as);
|
|
|
|
const struct ggml_tensor * src0_row = dst->src[row_id + 2];
|
|
|
|
src1_row.data = src1_original + i01*src1->nb[1];
|
|
dst_row.data = dst_original + i01*dst->nb[1];
|
|
|
|
ggml_cuda_mul_mat(ctx, src0_row, &src1_row, &dst_row);
|
|
}
|
|
} else {
|
|
ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
|
|
ggml_cuda_pool_alloc<char> dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
|
|
|
|
src1_row.data = src1_contiguous.get();
|
|
dst_row.data = dst_contiguous.get();
|
|
|
|
for (int32_t row_id = 0; row_id < n_as; ++row_id) {
|
|
const struct ggml_tensor * src0_row = dst->src[row_id + 2];
|
|
|
|
int64_t num_src1_rows = 0;
|
|
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
|
|
const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
|
|
|
|
if (row_id_i != row_id) {
|
|
continue;
|
|
}
|
|
|
|
GGML_ASSERT(row_id >= 0 && row_id < n_as);
|
|
|
|
CUDA_CHECK(cudaMemcpyAsync(src1_contiguous.get() + num_src1_rows*nb11, src1_original + i01*nb11,
|
|
nb11, cudaMemcpyDeviceToDevice, stream));
|
|
num_src1_rows++;
|
|
}
|
|
|
|
if (num_src1_rows == 0) {
|
|
continue;
|
|
}
|
|
|
|
src1_row.ne[1] = num_src1_rows;
|
|
dst_row.ne[1] = num_src1_rows;
|
|
|
|
src1_row.nb[1] = nb11;
|
|
src1_row.nb[2] = num_src1_rows*nb11;
|
|
src1_row.nb[3] = num_src1_rows*nb11;
|
|
|
|
dst_row.nb[1] = nb1;
|
|
dst_row.nb[2] = num_src1_rows*nb1;
|
|
dst_row.nb[3] = num_src1_rows*nb1;
|
|
|
|
ggml_cuda_mul_mat(ctx, src0_row, &src1_row, &dst_row);
|
|
|
|
num_src1_rows = 0;
|
|
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
|
|
const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
|
|
|
|
if (row_id_i != row_id) {
|
|
continue;
|
|
}
|
|
|
|
GGML_ASSERT(row_id >= 0 && row_id < n_as);
|
|
|
|
CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous.get() + num_src1_rows*nb1,
|
|
nb1, cudaMemcpyDeviceToDevice, stream));
|
|
num_src1_rows++;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
|
|
// why is this here instead of mul_mat?
|
|
if (dst->src[0] != nullptr && ggml_backend_buffer_is_cuda_split(dst->src[0]->buffer)) {
|
|
ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device);
|
|
}
|
|
|
|
switch (dst->op) {
|
|
case GGML_OP_REPEAT:
|
|
ggml_cuda_op_repeat(ctx, dst);
|
|
break;
|
|
case GGML_OP_GET_ROWS:
|
|
ggml_cuda_op_get_rows(ctx, dst);
|
|
break;
|
|
case GGML_OP_DUP:
|
|
ggml_cuda_dup(ctx, dst);
|
|
break;
|
|
case GGML_OP_CPY:
|
|
ggml_cuda_cpy(ctx, dst->src[0], dst->src[1]);
|
|
break;
|
|
case GGML_OP_CONT:
|
|
ggml_cuda_dup(ctx, dst);
|
|
break;
|
|
case GGML_OP_ADD:
|
|
ggml_cuda_op_add(ctx, dst);
|
|
break;
|
|
case GGML_OP_ACC:
|
|
ggml_cuda_op_acc(ctx, dst);
|
|
break;
|
|
case GGML_OP_MUL:
|
|
ggml_cuda_op_mul(ctx, dst);
|
|
break;
|
|
case GGML_OP_DIV:
|
|
ggml_cuda_op_div(ctx, dst);
|
|
break;
|
|
case GGML_OP_UNARY:
|
|
switch (ggml_get_unary_op(dst)) {
|
|
case GGML_UNARY_OP_GELU:
|
|
ggml_cuda_op_gelu(ctx, dst);
|
|
break;
|
|
case GGML_UNARY_OP_SILU:
|
|
ggml_cuda_op_silu(ctx, dst);
|
|
break;
|
|
case GGML_UNARY_OP_GELU_QUICK:
|
|
ggml_cuda_op_gelu_quick(ctx, dst);
|
|
break;
|
|
case GGML_UNARY_OP_TANH:
|
|
ggml_cuda_op_tanh(ctx, dst);
|
|
break;
|
|
case GGML_UNARY_OP_RELU:
|
|
ggml_cuda_op_relu(ctx, dst);
|
|
break;
|
|
case GGML_UNARY_OP_HARDSIGMOID:
|
|
ggml_cuda_op_hardsigmoid(ctx, dst);
|
|
break;
|
|
case GGML_UNARY_OP_HARDSWISH:
|
|
ggml_cuda_op_hardswish(ctx, dst);
|
|
break;
|
|
default:
|
|
return false;
|
|
}
|
|
break;
|
|
case GGML_OP_NORM:
|
|
ggml_cuda_op_norm(ctx, dst);
|
|
break;
|
|
case GGML_OP_GROUP_NORM:
|
|
ggml_cuda_op_group_norm(ctx, dst);
|
|
break;
|
|
case GGML_OP_CONCAT:
|
|
ggml_cuda_op_concat(ctx, dst);
|
|
break;
|
|
case GGML_OP_UPSCALE:
|
|
ggml_cuda_op_upscale(ctx, dst);
|
|
break;
|
|
case GGML_OP_PAD:
|
|
ggml_cuda_op_pad(ctx, dst);
|
|
break;
|
|
case GGML_OP_ARANGE:
|
|
ggml_cuda_op_arange(ctx, dst);
|
|
break;
|
|
case GGML_OP_TIMESTEP_EMBEDDING:
|
|
ggml_cuda_op_timestep_embedding(ctx, dst);
|
|
break;
|
|
case GGML_OP_LEAKY_RELU:
|
|
ggml_cuda_op_leaky_relu(ctx, dst);
|
|
break;
|
|
case GGML_OP_RMS_NORM:
|
|
ggml_cuda_op_rms_norm(ctx, dst);
|
|
break;
|
|
case GGML_OP_MUL_MAT:
|
|
if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
|
|
fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
|
|
return false;
|
|
} else {
|
|
ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst);
|
|
}
|
|
break;
|
|
case GGML_OP_MUL_MAT_ID:
|
|
ggml_cuda_mul_mat_id(ctx, dst);
|
|
break;
|
|
case GGML_OP_SCALE:
|
|
ggml_cuda_op_scale(ctx, dst);
|
|
break;
|
|
case GGML_OP_SQR:
|
|
ggml_cuda_op_sqr(ctx, dst);
|
|
break;
|
|
case GGML_OP_CLAMP:
|
|
ggml_cuda_op_clamp(ctx, dst);
|
|
break;
|
|
case GGML_OP_NONE:
|
|
case GGML_OP_RESHAPE:
|
|
case GGML_OP_VIEW:
|
|
case GGML_OP_PERMUTE:
|
|
case GGML_OP_TRANSPOSE:
|
|
break;
|
|
case GGML_OP_DIAG_MASK_INF:
|
|
ggml_cuda_op_diag_mask_inf(ctx, dst);
|
|
break;
|
|
case GGML_OP_SOFT_MAX:
|
|
ggml_cuda_op_soft_max(ctx, dst);
|
|
break;
|
|
case GGML_OP_ROPE:
|
|
ggml_cuda_op_rope(ctx, dst);
|
|
break;
|
|
case GGML_OP_ALIBI:
|
|
ggml_cuda_op_alibi(ctx, dst);
|
|
break;
|
|
case GGML_OP_IM2COL:
|
|
ggml_cuda_op_im2col(ctx, dst);
|
|
break;
|
|
case GGML_OP_POOL_2D:
|
|
ggml_cuda_op_pool2d(ctx, dst);
|
|
break;
|
|
case GGML_OP_SUM_ROWS:
|
|
ggml_cuda_op_sum_rows(ctx, dst);
|
|
break;
|
|
case GGML_OP_ARGSORT:
|
|
ggml_cuda_op_argsort(ctx, dst);
|
|
break;
|
|
default:
|
|
return false;
|
|
}
|
|
|
|
cudaError_t err = cudaGetLastError();
|
|
if (err != cudaSuccess) {
|
|
fprintf(stderr, "%s: %s failed\n", __func__, ggml_op_desc(dst));
|
|
GGML_ASSERT(false);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
// backend
|
|
|
|
GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
|
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
|
|
|
return cuda_ctx->name.c_str();
|
|
}
|
|
|
|
GGML_CALL static void ggml_backend_cuda_free(ggml_backend_t backend) {
|
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
|
|
|
delete cuda_ctx;
|
|
delete backend;
|
|
}
|
|
|
|
GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
|
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
|
|
|
return ggml_backend_cuda_buffer_type(cuda_ctx->device);
|
|
}
|
|
|
|
GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
|
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
|
|
|
GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
|
|
|
CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream()));
|
|
}
|
|
|
|
GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
|
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
|
|
|
GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
|
|
|
CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cuda_ctx->stream()));
|
|
}
|
|
|
|
GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
|
|
GGML_ASSERT(ggml_backend_is_cuda(backend_src) || ggml_backend_is_cuda(backend_dst));
|
|
|
|
ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
|
|
ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
|
|
|
|
if (!ggml_backend_buffer_is_cuda(src->buffer)) {
|
|
return false;
|
|
}
|
|
|
|
if (!ggml_backend_buffer_is_cuda(dst->buffer)) {
|
|
return false;
|
|
}
|
|
|
|
// device -> device
|
|
ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
|
|
ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
|
|
|
|
if (backend_src != backend_dst) {
|
|
ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
|
|
ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
|
|
|
|
GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device);
|
|
GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device);
|
|
|
|
// copy on src stream
|
|
if (cuda_ctx_src->device == cuda_ctx_dst->device) {
|
|
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
|
|
} else {
|
|
#ifdef GGML_CUDA_NO_PEER_COPY
|
|
return false;
|
|
#else
|
|
CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
|
|
#endif
|
|
}
|
|
|
|
// record event on src stream
|
|
if (!cuda_ctx_src->copy_event) {
|
|
ggml_cuda_set_device(cuda_ctx_src->device);
|
|
CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
|
|
}
|
|
|
|
CUDA_CHECK(cudaEventRecord(cuda_ctx_src->copy_event, cuda_ctx_src->stream()));
|
|
|
|
// wait on dst stream for the copy to complete
|
|
CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_dst->stream(), cuda_ctx_src->copy_event, 0));
|
|
} else {
|
|
// src and dst are on the same backend
|
|
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
|
|
}
|
|
return true;
|
|
}
|
|
|
|
GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
|
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
|
|
|
CUDA_CHECK(cudaStreamSynchronize(cuda_ctx->stream()));
|
|
|
|
GGML_UNUSED(backend);
|
|
}
|
|
|
|
GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
|
|
|
ggml_cuda_set_device(cuda_ctx->device);
|
|
|
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
|
ggml_tensor * node = cgraph->nodes[i];
|
|
|
|
if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
|
|
continue;
|
|
}
|
|
|
|
#ifndef NDEBUG
|
|
assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
|
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
if (node->src[j] != nullptr) {
|
|
assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
|
|
}
|
|
}
|
|
#endif
|
|
|
|
bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
|
|
if (!ok) {
|
|
fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
|
|
}
|
|
GGML_ASSERT(ok);
|
|
}
|
|
|
|
return GGML_STATUS_SUCCESS;
|
|
}
|
|
|
|
GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
|
|
switch (op->op) {
|
|
case GGML_OP_UNARY:
|
|
switch (ggml_get_unary_op(op)) {
|
|
case GGML_UNARY_OP_GELU:
|
|
case GGML_UNARY_OP_SILU:
|
|
case GGML_UNARY_OP_RELU:
|
|
case GGML_UNARY_OP_HARDSIGMOID:
|
|
case GGML_UNARY_OP_HARDSWISH:
|
|
case GGML_UNARY_OP_GELU_QUICK:
|
|
case GGML_UNARY_OP_TANH:
|
|
return true;
|
|
default:
|
|
return false;
|
|
}
|
|
break;
|
|
case GGML_OP_MUL_MAT:
|
|
case GGML_OP_MUL_MAT_ID:
|
|
{
|
|
struct ggml_tensor * a;
|
|
struct ggml_tensor * b;
|
|
if (op->op == GGML_OP_MUL_MAT) {
|
|
a = op->src[0];
|
|
b = op->src[1];
|
|
} else {
|
|
a = op->src[2];
|
|
b = op->src[1];
|
|
}
|
|
if (a->ne[3] != b->ne[3]) {
|
|
return false;
|
|
}
|
|
ggml_type a_type = a->type;
|
|
if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
|
|
a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S ||
|
|
a_type == GGML_TYPE_IQ1_M || a_type == GGML_TYPE_IQ2_S || a_type == GGML_TYPE_IQ4_XS) {
|
|
if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
} break;
|
|
case GGML_OP_GET_ROWS:
|
|
{
|
|
switch (op->src[0]->type) {
|
|
case GGML_TYPE_F16:
|
|
case GGML_TYPE_F32:
|
|
case GGML_TYPE_Q4_0:
|
|
case GGML_TYPE_Q4_1:
|
|
case GGML_TYPE_Q5_0:
|
|
case GGML_TYPE_Q5_1:
|
|
case GGML_TYPE_Q8_0:
|
|
return true;
|
|
default:
|
|
return false;
|
|
}
|
|
} break;
|
|
case GGML_OP_CPY:
|
|
{
|
|
ggml_type src0_type = op->src[0]->type;
|
|
ggml_type src1_type = op->src[1]->type;
|
|
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
|
|
return true;
|
|
}
|
|
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
|
|
return true;
|
|
}
|
|
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
|
|
return true;
|
|
}
|
|
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
|
|
return true;
|
|
}
|
|
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) {
|
|
return true;
|
|
}
|
|
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q5_0) {
|
|
return true;
|
|
}
|
|
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q5_1) {
|
|
return true;
|
|
}
|
|
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) {
|
|
return true;
|
|
}
|
|
if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
|
|
return true;
|
|
}
|
|
if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
|
|
return true;
|
|
}
|
|
return false;
|
|
} break;
|
|
case GGML_OP_DUP:
|
|
case GGML_OP_REPEAT:
|
|
case GGML_OP_CONCAT:
|
|
{
|
|
ggml_type src0_type = op->src[0]->type;
|
|
return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
|
|
} break;
|
|
case GGML_OP_NONE:
|
|
case GGML_OP_RESHAPE:
|
|
case GGML_OP_VIEW:
|
|
case GGML_OP_PERMUTE:
|
|
case GGML_OP_TRANSPOSE:
|
|
case GGML_OP_NORM:
|
|
case GGML_OP_ADD:
|
|
case GGML_OP_MUL:
|
|
case GGML_OP_DIV:
|
|
case GGML_OP_RMS_NORM:
|
|
case GGML_OP_SCALE:
|
|
case GGML_OP_SQR:
|
|
case GGML_OP_CLAMP:
|
|
case GGML_OP_CONT:
|
|
case GGML_OP_DIAG_MASK_INF:
|
|
case GGML_OP_SOFT_MAX:
|
|
case GGML_OP_ROPE:
|
|
case GGML_OP_ALIBI:
|
|
case GGML_OP_IM2COL:
|
|
case GGML_OP_POOL_2D:
|
|
case GGML_OP_SUM_ROWS:
|
|
case GGML_OP_ARGSORT:
|
|
case GGML_OP_ACC:
|
|
case GGML_OP_GROUP_NORM:
|
|
case GGML_OP_UPSCALE:
|
|
case GGML_OP_PAD:
|
|
case GGML_OP_ARANGE:
|
|
case GGML_OP_TIMESTEP_EMBEDDING:
|
|
case GGML_OP_LEAKY_RELU:
|
|
return true;
|
|
default:
|
|
return false;
|
|
}
|
|
|
|
GGML_UNUSED(backend);
|
|
}
|
|
|
|
GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
|
|
const int min_batch_size = 32;
|
|
|
|
return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
|
|
|
|
GGML_UNUSED(backend);
|
|
}
|
|
|
|
static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) {
|
|
#ifdef GGML_CUDA_NO_PEER_COPY
|
|
return nullptr;
|
|
#else
|
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
|
|
|
ggml_cuda_set_device(cuda_ctx->device);
|
|
|
|
cudaEvent_t event;
|
|
CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
|
|
|
|
return new ggml_backend_event {
|
|
/* .backend = */ backend,
|
|
/* .context = */ event,
|
|
};
|
|
#endif
|
|
}
|
|
|
|
static void ggml_backend_cuda_event_free(ggml_backend_event_t event) {
|
|
CUDA_CHECK(cudaEventDestroy((cudaEvent_t)event->context));
|
|
|
|
delete event;
|
|
}
|
|
|
|
static void ggml_backend_cuda_event_record(ggml_backend_event_t event) {
|
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)event->backend->context;
|
|
|
|
CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, cuda_ctx->stream()));
|
|
}
|
|
|
|
static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
|
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
|
|
|
if (ggml_backend_is_cuda(event->backend)) {
|
|
CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), (cudaEvent_t)event->context, 0));
|
|
} else {
|
|
#if 0
|
|
// untested
|
|
auto wait_fn = [](void * user_data) {
|
|
ggml_backend_event_t event = (ggml_backend_event_t)user_data;
|
|
ggml_backend_event_synchronize(event);
|
|
};
|
|
|
|
CUDA_CHECK(cudaLaunchHostFunc(cuda_ctx->stream(), wait_fn, event));
|
|
#endif
|
|
GGML_ASSERT(false);
|
|
}
|
|
}
|
|
|
|
static void ggml_backend_cuda_event_synchronize(ggml_backend_event_t event) {
|
|
CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
|
|
}
|
|
|
|
static ggml_backend_i ggml_backend_cuda_interface = {
|
|
/* .get_name = */ ggml_backend_cuda_name,
|
|
/* .free = */ ggml_backend_cuda_free,
|
|
/* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
|
|
/* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
|
|
/* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
|
|
/* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
|
|
/* .synchronize = */ ggml_backend_cuda_synchronize,
|
|
/* .graph_plan_create = */ NULL,
|
|
/* .graph_plan_free = */ NULL,
|
|
/* .graph_plan_compute = */ NULL,
|
|
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
|
|
/* .supports_op = */ ggml_backend_cuda_supports_op,
|
|
/* .offload_op = */ ggml_backend_cuda_offload_op,
|
|
/* .event_new = */ ggml_backend_cuda_event_new,
|
|
/* .event_free = */ ggml_backend_cuda_event_free,
|
|
/* .event_record = */ ggml_backend_cuda_event_record,
|
|
/* .event_wait = */ ggml_backend_cuda_event_wait,
|
|
/* .event_synchronize = */ ggml_backend_cuda_event_synchronize,
|
|
};
|
|
|
|
static ggml_guid_t ggml_backend_cuda_guid() {
|
|
static ggml_guid guid = { 0x2c, 0xdd, 0xe8, 0x1c, 0x65, 0xb3, 0x65, 0x73, 0x6a, 0x12, 0x88, 0x61, 0x1c, 0xc9, 0xdc, 0x25 };
|
|
return &guid;
|
|
}
|
|
|
|
GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
|
|
if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
|
|
fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
|
|
return nullptr;
|
|
}
|
|
|
|
ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
|
|
if (ctx == nullptr) {
|
|
fprintf(stderr, "%s: error: failed to allocate context\n", __func__);
|
|
return nullptr;
|
|
}
|
|
|
|
ggml_backend_t cuda_backend = new ggml_backend {
|
|
/* .guid = */ ggml_backend_cuda_guid(),
|
|
/* .interface = */ ggml_backend_cuda_interface,
|
|
/* .context = */ ctx
|
|
};
|
|
|
|
return cuda_backend;
|
|
}
|
|
|
|
GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend) {
|
|
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cuda_guid());
|
|
}
|
|
|
|
GGML_CALL int ggml_backend_cuda_get_device_count() {
|
|
return ggml_cuda_info().device_count;
|
|
}
|
|
|
|
GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
|
|
cudaDeviceProp prop;
|
|
CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
|
|
snprintf(description, description_size, "%s", prop.name);
|
|
}
|
|
|
|
GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
|
|
ggml_cuda_set_device(device);
|
|
|
|
CUDA_CHECK(cudaMemGetInfo(free, total));
|
|
}
|
|
|
|
GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
|
|
if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
|
|
return false;
|
|
}
|
|
|
|
cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
|
|
if (err != cudaSuccess) {
|
|
// clear the error
|
|
cudaGetLastError();
|
|
|
|
fprintf(stderr, "%s: warning: failed to register %.2f MiB of pinned memory: %s\n", __func__,
|
|
size/1024.0/1024.0, cudaGetErrorString(err));
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
|
|
if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
|
|
return;
|
|
}
|
|
|
|
cudaError_t err = cudaHostUnregister(buffer);
|
|
if (err != cudaSuccess) {
|
|
// clear the error
|
|
cudaGetLastError();
|
|
}
|
|
}
|
|
|
|
// backend registry
|
|
GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
|
|
ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
|
|
return cuda_backend;
|
|
|
|
GGML_UNUSED(params);
|
|
}
|
|
|
|
extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();
|
|
|
|
GGML_CALL int ggml_backend_cuda_reg_devices() {
|
|
int device_count = ggml_backend_cuda_get_device_count();
|
|
//int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
|
|
for (int i = 0; i < device_count; i++) {
|
|
char name[128];
|
|
snprintf(name, sizeof(name), "%s%d", GGML_CUDA_NAME, i);
|
|
ggml_backend_register(name, ggml_backend_reg_cuda_init, ggml_backend_cuda_buffer_type(i), (void *) (intptr_t) i);
|
|
}
|
|
return device_count;
|
|
}
|