mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-12 19:50:17 +00:00
kompute: add backend registry / device interfaces (#10045)
Some checks are pending
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full-cuda.Dockerfile platforms:linux/amd64 tag:full-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full-musa.Dockerfile platforms:linux/amd64 tag:full-musa]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full.Dockerfile platforms:linux/amd64,linux/arm64 tag:full]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-cuda.Dockerfile platforms:linux/amd64 tag:light-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-intel.Dockerfile platforms:linux/amd64 tag:light-intel]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-musa.Dockerfile platforms:linux/amd64 tag:light-musa]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli.Dockerfile platforms:linux/amd64,linux/arm64 tag:light]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-cuda.Dockerfile platforms:linux/amd64 tag:server-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-intel.Dockerfile platforms:linux/amd64 tag:server-intel]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-musa.Dockerfile platforms:linux/amd64 tag:server-musa]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server.Dockerfile platforms:linux/amd64,linux/arm64 tag:server]) (push) Waiting to run
Nix CI / nix-eval (macos-latest) (push) Waiting to run
Nix CI / nix-eval (ubuntu-latest) (push) Waiting to run
Nix CI / nix-build (macos-latest) (push) Waiting to run
Nix CI / nix-build (ubuntu-latest) (push) Waiting to run
flake8 Lint / Lint (push) Waiting to run
Some checks are pending
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full-cuda.Dockerfile platforms:linux/amd64 tag:full-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full-musa.Dockerfile platforms:linux/amd64 tag:full-musa]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full.Dockerfile platforms:linux/amd64,linux/arm64 tag:full]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-cuda.Dockerfile platforms:linux/amd64 tag:light-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-intel.Dockerfile platforms:linux/amd64 tag:light-intel]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-musa.Dockerfile platforms:linux/amd64 tag:light-musa]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli.Dockerfile platforms:linux/amd64,linux/arm64 tag:light]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-cuda.Dockerfile platforms:linux/amd64 tag:server-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-intel.Dockerfile platforms:linux/amd64 tag:server-intel]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-musa.Dockerfile platforms:linux/amd64 tag:server-musa]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server.Dockerfile platforms:linux/amd64,linux/arm64 tag:server]) (push) Waiting to run
Nix CI / nix-eval (macos-latest) (push) Waiting to run
Nix CI / nix-eval (ubuntu-latest) (push) Waiting to run
Nix CI / nix-build (macos-latest) (push) Waiting to run
Nix CI / nix-build (ubuntu-latest) (push) Waiting to run
flake8 Lint / Lint (push) Waiting to run
Get in line with the other backends by supporting the newer backend/device registry interfaces. Signed-off-by: Sergio Lopez <slp@redhat.com>
This commit is contained in:
parent
b9e02e8184
commit
61408e7fad
@ -11,6 +11,8 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define GGML_KOMPUTE_MAX_DEVICES 16
|
||||
|
||||
struct ggml_vk_device {
|
||||
int index;
|
||||
int type; // same as VkPhysicalDeviceType
|
||||
@ -41,6 +43,8 @@ GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend);
|
||||
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
|
||||
|
||||
GGML_API ggml_backend_reg_t ggml_backend_kompute_reg(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@ -562,6 +562,10 @@ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * na
|
||||
#include "ggml-cann.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_KOMPUTE
|
||||
#include "ggml-kompute.h"
|
||||
#endif
|
||||
|
||||
struct ggml_backend_registry {
|
||||
std::vector<ggml_backend_reg_t> backends;
|
||||
std::vector<ggml_backend_dev_t> devices;
|
||||
@ -591,8 +595,9 @@ struct ggml_backend_registry {
|
||||
#ifdef GGML_USE_AMX
|
||||
register_backend(ggml_backend_amx_reg());
|
||||
#endif
|
||||
|
||||
// TODO: kompute
|
||||
#ifdef GGML_USE_KOMPUTE
|
||||
register_backend(ggml_backend_kompute_reg());
|
||||
#endif
|
||||
|
||||
register_backend(ggml_backend_cpu_reg());
|
||||
}
|
||||
|
@ -42,6 +42,7 @@
|
||||
#include <cstring>
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
@ -273,18 +274,9 @@ static std::vector<ggml_vk_device> ggml_vk_available_devices_internal(size_t mem
|
||||
return results;
|
||||
}
|
||||
|
||||
// public API returns a C-style array
|
||||
ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count) {
|
||||
auto devices = ggml_vk_available_devices_internal(memoryRequired);
|
||||
*count = devices.size();
|
||||
if (devices.empty()) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
size_t nbytes = sizeof (ggml_vk_device) * (devices.size());
|
||||
auto * arr = static_cast<ggml_vk_device *>(malloc(nbytes));
|
||||
memcpy(arr, devices.data(), nbytes);
|
||||
return arr;
|
||||
static std::vector<ggml_vk_device>& ggml_vk_available_devices() {
|
||||
static std::vector<ggml_vk_device> devices = ggml_vk_available_devices_internal(0);
|
||||
return devices;
|
||||
}
|
||||
|
||||
static void ggml_vk_filterByVendor(std::vector<ggml_vk_device>& devices, const std::string& targetVendor) {
|
||||
@ -341,7 +333,7 @@ ggml_vk_device ggml_vk_current_device() {
|
||||
if (!komputeManager()->hasDevice())
|
||||
return ggml_vk_device();
|
||||
|
||||
auto devices = ggml_vk_available_devices_internal(0);
|
||||
auto devices = ggml_vk_available_devices();
|
||||
ggml_vk_filterByName(devices, komputeManager()->physicalDevice()->getProperties().deviceName.data());
|
||||
GGML_ASSERT(!devices.empty());
|
||||
return devices.front();
|
||||
@ -1323,17 +1315,7 @@ static void ggml_vk_cpy_f16_f32(Args&&... args) {
|
||||
ggml_vk_cpy(spirv, 2, 4, std::forward<Args>(args)...);
|
||||
}
|
||||
|
||||
static bool ggml_vk_supports_op(const struct ggml_tensor * op) {
|
||||
switch (op->type) {
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool ggml_backend_kompute_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
|
||||
switch (op->op) {
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(op)) {
|
||||
@ -1410,6 +1392,8 @@ static bool ggml_vk_supports_op(const struct ggml_tensor * op) {
|
||||
;
|
||||
}
|
||||
return false;
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf) {
|
||||
@ -1458,11 +1442,6 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
|
||||
|
||||
any_commands_recorded = true;
|
||||
|
||||
if (!ggml_vk_supports_op(dst)) {
|
||||
fprintf(stderr, "%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
|
||||
GGML_ABORT("unsupported op");
|
||||
}
|
||||
|
||||
const int32_t ne00 = src0 ? src0->ne[0] : 0;
|
||||
const int32_t ne01 = src0 ? src0->ne[1] : 0;
|
||||
const int32_t ne02 = src0 ? src0->ne[2] : 0;
|
||||
@ -1907,25 +1886,31 @@ static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = {
|
||||
};
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device) {
|
||||
static std::vector<ggml_backend_buffer_type> bufts = []() {
|
||||
std::vector<ggml_backend_buffer_type> vec;
|
||||
auto devices = ggml_vk_available_devices_internal(0);
|
||||
vec.reserve(devices.size());
|
||||
static std::mutex mutex;
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
|
||||
for (const auto & dev : devices) {
|
||||
vec.push_back({
|
||||
/* .iface = */ ggml_backend_kompute_buffer_type_interface,
|
||||
/* .device = */ nullptr,
|
||||
/* .context = */ new ggml_backend_kompute_buffer_type_context(dev.index, dev.bufferAlignment, dev.maxAlloc)
|
||||
});
|
||||
auto devices = ggml_vk_available_devices();
|
||||
int32_t device_count = (int32_t) devices.size();
|
||||
GGML_ASSERT(device < device_count);
|
||||
GGML_ASSERT(devices.size() <= GGML_KOMPUTE_MAX_DEVICES);
|
||||
|
||||
static ggml_backend_buffer_type
|
||||
ggml_backend_kompute_buffer_types[GGML_KOMPUTE_MAX_DEVICES];
|
||||
|
||||
static bool ggml_backend_kompute_buffer_type_initialized = false;
|
||||
|
||||
if (!ggml_backend_kompute_buffer_type_initialized) {
|
||||
for (int32_t i = 0; i < device_count; i++) {
|
||||
ggml_backend_kompute_buffer_types[i] = {
|
||||
/* .iface = */ ggml_backend_kompute_buffer_type_interface,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_kompute_reg(), i),
|
||||
/* .context = */ new ggml_backend_kompute_buffer_type_context{ i, devices[i].bufferAlignment, devices[i].maxAlloc },
|
||||
};
|
||||
}
|
||||
return vec;
|
||||
}();
|
||||
ggml_backend_kompute_buffer_type_initialized = true;
|
||||
}
|
||||
|
||||
auto it = std::find_if(bufts.begin(), bufts.end(), [device](const ggml_backend_buffer_type & t) {
|
||||
return device == static_cast<ggml_backend_kompute_buffer_type_context *>(t.context)->device;
|
||||
});
|
||||
return it < bufts.end() ? &*it : nullptr;
|
||||
return &ggml_backend_kompute_buffer_types[device];
|
||||
}
|
||||
|
||||
// backend
|
||||
@ -1953,16 +1938,6 @@ static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, st
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
||||
GGML_UNUSED(backend);
|
||||
return ggml_vk_supports_op(op);
|
||||
}
|
||||
|
||||
static bool ggml_backend_kompute_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
||||
GGML_UNUSED(backend);
|
||||
return buft->iface.get_name == ggml_backend_kompute_buffer_type_get_name;
|
||||
}
|
||||
|
||||
static struct ggml_backend_i kompute_backend_i = {
|
||||
/* .get_name = */ ggml_backend_kompute_name,
|
||||
/* .free = */ ggml_backend_kompute_free,
|
||||
@ -1991,7 +1966,7 @@ ggml_backend_t ggml_backend_kompute_init(int device) {
|
||||
ggml_backend_t kompute_backend = new ggml_backend {
|
||||
/* .guid = */ ggml_backend_kompute_guid(),
|
||||
/* .interface = */ kompute_backend_i,
|
||||
/* .device = */ nullptr,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_kompute_reg(), device),
|
||||
/* .context = */ s_kompute_context,
|
||||
};
|
||||
|
||||
@ -2001,3 +1976,167 @@ ggml_backend_t ggml_backend_kompute_init(int device) {
|
||||
bool ggml_backend_is_kompute(ggml_backend_t backend) {
|
||||
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_kompute_guid());
|
||||
}
|
||||
|
||||
static size_t ggml_backend_kompute_get_device_count() {
|
||||
auto devices = ggml_vk_available_devices();
|
||||
return devices.size();
|
||||
}
|
||||
|
||||
static void ggml_backend_kompute_get_device_description(int device, char * description, size_t description_size) {
|
||||
auto devices = ggml_vk_available_devices();
|
||||
GGML_ASSERT((size_t) device < devices.size());
|
||||
snprintf(description, description_size, "%s", devices[device].name);
|
||||
}
|
||||
|
||||
static void ggml_backend_kompute_get_device_memory(int device, size_t * free, size_t * total) {
|
||||
auto devices = ggml_vk_available_devices();
|
||||
GGML_ASSERT((size_t) device < devices.size());
|
||||
*total = devices[device].heapSize;
|
||||
*free = devices[device].heapSize;
|
||||
}
|
||||
|
||||
//////////////////////////
|
||||
|
||||
struct ggml_backend_kompute_device_context {
|
||||
int device;
|
||||
std::string name;
|
||||
std::string description;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_kompute_device_get_name(ggml_backend_dev_t dev) {
|
||||
ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context;
|
||||
return ctx->name.c_str();
|
||||
}
|
||||
|
||||
static const char * ggml_backend_kompute_device_get_description(ggml_backend_dev_t dev) {
|
||||
ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context;
|
||||
return ctx->description.c_str();
|
||||
}
|
||||
|
||||
static void ggml_backend_kompute_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
||||
ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context;
|
||||
ggml_backend_kompute_get_device_memory(ctx->device, free, total);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_kompute_device_get_buffer_type(ggml_backend_dev_t dev) {
|
||||
ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context;
|
||||
return ggml_backend_kompute_buffer_type(ctx->device);
|
||||
}
|
||||
|
||||
static bool ggml_backend_kompute_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
|
||||
if (buft->iface.get_name != ggml_backend_kompute_buffer_type_get_name) {
|
||||
return false;
|
||||
}
|
||||
|
||||
ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context;
|
||||
ggml_backend_kompute_buffer_type_context * buft_ctx = (ggml_backend_kompute_buffer_type_context *)buft->context;
|
||||
|
||||
return buft_ctx->device == ctx->device;
|
||||
}
|
||||
|
||||
static enum ggml_backend_dev_type ggml_backend_kompute_device_get_type(ggml_backend_dev_t dev) {
|
||||
GGML_UNUSED(dev);
|
||||
return GGML_BACKEND_DEVICE_TYPE_GPU;
|
||||
}
|
||||
|
||||
static void ggml_backend_kompute_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
|
||||
props->name = ggml_backend_kompute_device_get_name(dev);
|
||||
props->description = ggml_backend_kompute_device_get_description(dev);
|
||||
props->type = ggml_backend_kompute_device_get_type(dev);
|
||||
ggml_backend_kompute_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||
props->caps = {
|
||||
/* async = */ false,
|
||||
/* host_buffer = */ false,
|
||||
/* .buffer_from_host_ptr = */ false,
|
||||
/* events = */ false,
|
||||
};
|
||||
}
|
||||
|
||||
static ggml_backend_t ggml_backend_kompute_device_init(ggml_backend_dev_t dev, const char * params) {
|
||||
GGML_UNUSED(params);
|
||||
ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context;
|
||||
return ggml_backend_kompute_init(ctx->device);
|
||||
}
|
||||
|
||||
static bool ggml_backend_kompute_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
||||
const int min_batch_size = 32;
|
||||
|
||||
return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
|
||||
(op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static const struct ggml_backend_device_i ggml_backend_kompute_device_i = {
|
||||
/* .get_name = */ ggml_backend_kompute_device_get_name,
|
||||
/* .get_description = */ ggml_backend_kompute_device_get_description,
|
||||
/* .get_memory = */ ggml_backend_kompute_device_get_memory,
|
||||
/* .get_type = */ ggml_backend_kompute_device_get_type,
|
||||
/* .get_props = */ ggml_backend_kompute_device_get_props,
|
||||
/* .init_backend = */ ggml_backend_kompute_device_init,
|
||||
/* .get_buffer_type = */ ggml_backend_kompute_device_get_buffer_type,
|
||||
/* .get_host_buffer_type = */ NULL,
|
||||
/* .buffer_from_host_ptr = */ NULL,
|
||||
/* .supports_op = */ ggml_backend_kompute_device_supports_op,
|
||||
/* .supports_buft = */ ggml_backend_kompute_device_supports_buft,
|
||||
/* .offload_op = */ ggml_backend_kompute_device_offload_op,
|
||||
/* .event_new = */ NULL,
|
||||
/* .event_free = */ NULL,
|
||||
/* .event_synchronize = */ NULL,
|
||||
};
|
||||
|
||||
static const char * ggml_backend_kompute_reg_get_name(ggml_backend_reg_t reg) {
|
||||
GGML_UNUSED(reg);
|
||||
return "Kompute";
|
||||
}
|
||||
|
||||
static size_t ggml_backend_kompute_reg_get_device_count(ggml_backend_reg_t reg) {
|
||||
GGML_UNUSED(reg);
|
||||
return ggml_backend_kompute_get_device_count();
|
||||
}
|
||||
|
||||
static ggml_backend_dev_t ggml_backend_kompute_reg_get_device(ggml_backend_reg_t reg, size_t device) {
|
||||
static std::vector<ggml_backend_dev_t> devices;
|
||||
|
||||
static bool initialized = false;
|
||||
|
||||
{
|
||||
static std::mutex mutex;
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
if (!initialized) {
|
||||
for (size_t i = 0; i < ggml_backend_kompute_get_device_count(); i++) {
|
||||
ggml_backend_kompute_device_context * ctx = new ggml_backend_kompute_device_context;
|
||||
char desc[256];
|
||||
ggml_backend_kompute_get_device_description(i, desc, sizeof(desc));
|
||||
ctx->device = i;
|
||||
ctx->name = "Kompute" + std::to_string(i);
|
||||
ctx->description = desc;
|
||||
devices.push_back(new ggml_backend_device {
|
||||
/* .iface = */ ggml_backend_kompute_device_i,
|
||||
/* .reg = */ reg,
|
||||
/* .context = */ ctx,
|
||||
});
|
||||
}
|
||||
initialized = true;
|
||||
}
|
||||
}
|
||||
|
||||
GGML_ASSERT(device < devices.size());
|
||||
return devices[device];
|
||||
}
|
||||
|
||||
static const struct ggml_backend_reg_i ggml_backend_kompute_reg_i = {
|
||||
/* .get_name = */ ggml_backend_kompute_reg_get_name,
|
||||
/* .get_device_count = */ ggml_backend_kompute_reg_get_device_count,
|
||||
/* .get_device = */ ggml_backend_kompute_reg_get_device,
|
||||
/* .get_proc_address = */ NULL,
|
||||
};
|
||||
|
||||
ggml_backend_reg_t ggml_backend_kompute_reg() {
|
||||
static ggml_backend_reg reg = {
|
||||
/* .iface = */ ggml_backend_kompute_reg_i,
|
||||
/* .context = */ nullptr,
|
||||
};
|
||||
|
||||
return ®
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user