mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-25 02:44:36 +00:00
vulkan : add backend registry / device interfaces (#9721)
Some checks are pending
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full-cuda.Dockerfile platforms:linux/amd64 tag:full-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full-musa.Dockerfile platforms:linux/amd64 tag:full-musa]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full.Dockerfile platforms:linux/amd64,linux/arm64 tag:full]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-cuda.Dockerfile platforms:linux/amd64 tag:light-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-intel.Dockerfile platforms:linux/amd64 tag:light-intel]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-musa.Dockerfile platforms:linux/amd64 tag:light-musa]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli.Dockerfile platforms:linux/amd64,linux/arm64 tag:light]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-cuda.Dockerfile platforms:linux/amd64 tag:server-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-intel.Dockerfile platforms:linux/amd64 tag:server-intel]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-musa.Dockerfile platforms:linux/amd64 tag:server-musa]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server.Dockerfile platforms:linux/amd64,linux/arm64 tag:server]) (push) Waiting to run
Nix CI / nix-eval (macos-latest) (push) Waiting to run
Nix CI / nix-eval (ubuntu-latest) (push) Waiting to run
Nix CI / nix-build (macos-latest) (push) Waiting to run
Nix CI / nix-build (ubuntu-latest) (push) Waiting to run
flake8 Lint / Lint (push) Waiting to run
Some checks are pending
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full-cuda.Dockerfile platforms:linux/amd64 tag:full-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full-musa.Dockerfile platforms:linux/amd64 tag:full-musa]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full.Dockerfile platforms:linux/amd64,linux/arm64 tag:full]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-cuda.Dockerfile platforms:linux/amd64 tag:light-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-intel.Dockerfile platforms:linux/amd64 tag:light-intel]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-musa.Dockerfile platforms:linux/amd64 tag:light-musa]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli.Dockerfile platforms:linux/amd64,linux/arm64 tag:light]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-cuda.Dockerfile platforms:linux/amd64 tag:server-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-intel.Dockerfile platforms:linux/amd64 tag:server-intel]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-musa.Dockerfile platforms:linux/amd64 tag:server-musa]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server.Dockerfile platforms:linux/amd64,linux/arm64 tag:server]) (push) Waiting to run
Nix CI / nix-eval (macos-latest) (push) Waiting to run
Nix CI / nix-eval (ubuntu-latest) (push) Waiting to run
Nix CI / nix-build (macos-latest) (push) Waiting to run
Nix CI / nix-build (ubuntu-latest) (push) Waiting to run
flake8 Lint / Lint (push) Waiting to run
* vulkan : add backend registry / device interfaces
* llama : print devices used on model load
This commit is contained in:
parent
2194200278
commit
f010b77a37
@ -24,6 +24,8 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
|
|||||||
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
|
GGML_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
|
||||||
|
|
||||||
|
GGML_API ggml_backend_reg_t ggml_backend_vk_reg(void);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
@ -538,6 +538,10 @@ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * na
|
|||||||
#include "ggml-metal.h"
|
#include "ggml-metal.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_VULKAN
|
||||||
|
#include "ggml-vulkan.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef GGML_USE_BLAS
|
#ifdef GGML_USE_BLAS
|
||||||
#include "ggml-blas.h"
|
#include "ggml-blas.h"
|
||||||
#endif
|
#endif
|
||||||
@ -557,6 +561,9 @@ struct ggml_backend_registry {
|
|||||||
#ifdef GGML_USE_METAL
|
#ifdef GGML_USE_METAL
|
||||||
register_backend(ggml_backend_metal_reg());
|
register_backend(ggml_backend_metal_reg());
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef GGML_USE_VULKAN
|
||||||
|
register_backend(ggml_backend_vk_reg());
|
||||||
|
#endif
|
||||||
#ifdef GGML_USE_BLAS
|
#ifdef GGML_USE_BLAS
|
||||||
register_backend(ggml_backend_blas_reg());
|
register_backend(ggml_backend_blas_reg());
|
||||||
#endif
|
#endif
|
||||||
@ -564,7 +571,7 @@ struct ggml_backend_registry {
|
|||||||
register_backend(ggml_backend_rpc_reg());
|
register_backend(ggml_backend_rpc_reg());
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// TODO: sycl, vulkan, kompute, cann
|
// TODO: sycl, kompute, cann
|
||||||
|
|
||||||
register_backend(ggml_backend_cpu_reg());
|
register_backend(ggml_backend_cpu_reg());
|
||||||
}
|
}
|
||||||
|
@ -1941,7 +1941,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
|||||||
if (device->fp16) {
|
if (device->fp16) {
|
||||||
device_extensions.push_back("VK_KHR_shader_float16_int8");
|
device_extensions.push_back("VK_KHR_shader_float16_int8");
|
||||||
}
|
}
|
||||||
device->name = device->properties.deviceName.data();
|
device->name = GGML_VK_NAME + std::to_string(idx);
|
||||||
|
|
||||||
device_create_info = {
|
device_create_info = {
|
||||||
vk::DeviceCreateFlags(),
|
vk::DeviceCreateFlags(),
|
||||||
@ -1968,7 +1968,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
|||||||
|
|
||||||
device->buffer_type = {
|
device->buffer_type = {
|
||||||
/* .iface = */ ggml_backend_vk_buffer_type_interface,
|
/* .iface = */ ggml_backend_vk_buffer_type_interface,
|
||||||
/* .device = */ nullptr,
|
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), idx),
|
||||||
/* .context = */ new ggml_backend_vk_buffer_type_context{ device->name, device },
|
/* .context = */ new ggml_backend_vk_buffer_type_context{ device->name, device },
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -6378,7 +6378,7 @@ ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
|
|||||||
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
||||||
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
||||||
},
|
},
|
||||||
/* .device = */ nullptr,
|
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), 0),
|
||||||
/* .context = */ nullptr,
|
/* .context = */ nullptr,
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -6581,9 +6581,135 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
|||||||
UNUSED(backend);
|
UNUSED(backend);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
|
// TODO: enable async and synchronize
|
||||||
// ggml_backend_vk_context * ctx = (ggml_backend_vk_context *) backend->context;
|
static ggml_backend_i ggml_backend_vk_interface = {
|
||||||
|
/* .get_name = */ ggml_backend_vk_name,
|
||||||
|
/* .free = */ ggml_backend_vk_free,
|
||||||
|
/* .get_default_buffer_type = */ ggml_backend_vk_get_default_buffer_type,
|
||||||
|
/* .set_tensor_async = */ NULL, // ggml_backend_vk_set_tensor_async,
|
||||||
|
/* .get_tensor_async = */ NULL, // ggml_backend_vk_get_tensor_async,
|
||||||
|
/* .cpy_tensor_async = */ NULL, // ggml_backend_vk_cpy_tensor_async,
|
||||||
|
/* .synchronize = */ NULL, // ggml_backend_vk_synchronize,
|
||||||
|
/* .graph_plan_create = */ NULL,
|
||||||
|
/* .graph_plan_free = */ NULL,
|
||||||
|
/* .graph_plan_update = */ NULL,
|
||||||
|
/* .graph_plan_compute = */ NULL,
|
||||||
|
/* .graph_compute = */ ggml_backend_vk_graph_compute,
|
||||||
|
/* .supports_op = */ NULL,
|
||||||
|
/* .supports_buft = */ NULL,
|
||||||
|
/* .offload_op = */ NULL,
|
||||||
|
/* .event_record = */ NULL,
|
||||||
|
/* .event_wait = */ NULL,
|
||||||
|
};
|
||||||
|
|
||||||
|
static ggml_guid_t ggml_backend_vk_guid() {
|
||||||
|
static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b };
|
||||||
|
return &guid;
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
|
||||||
|
VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")");
|
||||||
|
|
||||||
|
ggml_backend_vk_context * ctx = new ggml_backend_vk_context;
|
||||||
|
ggml_vk_init(ctx, dev_num);
|
||||||
|
|
||||||
|
ggml_backend_t vk_backend = new ggml_backend {
|
||||||
|
/* .guid = */ ggml_backend_vk_guid(),
|
||||||
|
/* .interface = */ ggml_backend_vk_interface,
|
||||||
|
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), dev_num),
|
||||||
|
/* .context = */ ctx,
|
||||||
|
};
|
||||||
|
|
||||||
|
return vk_backend;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool ggml_backend_is_vk(ggml_backend_t backend) {
|
||||||
|
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_vk_guid());
|
||||||
|
}
|
||||||
|
|
||||||
|
int ggml_backend_vk_get_device_count() {
|
||||||
|
return ggml_vk_get_device_count();
|
||||||
|
}
|
||||||
|
|
||||||
|
void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size) {
|
||||||
|
GGML_ASSERT(device < (int) vk_instance.device_indices.size());
|
||||||
|
int dev_idx = vk_instance.device_indices[device];
|
||||||
|
ggml_vk_get_device_description(dev_idx, description, description_size);
|
||||||
|
}
|
||||||
|
|
||||||
|
void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
|
||||||
|
GGML_ASSERT(device < (int) vk_instance.device_indices.size());
|
||||||
|
|
||||||
|
vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
|
||||||
|
|
||||||
|
vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties();
|
||||||
|
|
||||||
|
for (const vk::MemoryHeap& heap : memprops.memoryHeaps) {
|
||||||
|
if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
|
||||||
|
*total = heap.size;
|
||||||
|
*free = heap.size;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//////////////////////////
|
||||||
|
|
||||||
|
struct ggml_backend_vk_device_context {
|
||||||
|
int device;
|
||||||
|
std::string name;
|
||||||
|
std::string description;
|
||||||
|
};
|
||||||
|
|
||||||
|
static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
|
||||||
|
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||||
|
return ctx->name.c_str();
|
||||||
|
}
|
||||||
|
|
||||||
|
static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t dev) {
|
||||||
|
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||||
|
return ctx->description.c_str();
|
||||||
|
}
|
||||||
|
|
||||||
|
static void ggml_backend_vk_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
|
||||||
|
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)device->context;
|
||||||
|
ggml_backend_vk_get_device_memory(ctx->device, free, total);
|
||||||
|
}
|
||||||
|
|
||||||
|
static ggml_backend_buffer_type_t ggml_backend_vk_device_get_buffer_type(ggml_backend_dev_t dev) {
|
||||||
|
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||||
|
return ggml_backend_vk_buffer_type(ctx->device);
|
||||||
|
}
|
||||||
|
|
||||||
|
static ggml_backend_buffer_type_t ggml_backend_vk_device_get_host_buffer_type(ggml_backend_dev_t dev) {
|
||||||
|
UNUSED(dev);
|
||||||
|
return ggml_backend_vk_host_buffer_type();
|
||||||
|
}
|
||||||
|
|
||||||
|
static enum ggml_backend_dev_type ggml_backend_vk_device_get_type(ggml_backend_dev_t dev) {
|
||||||
|
UNUSED(dev);
|
||||||
|
return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
|
||||||
|
props->name = ggml_backend_vk_device_get_name(dev);
|
||||||
|
props->description = ggml_backend_vk_device_get_description(dev);
|
||||||
|
props->type = ggml_backend_vk_device_get_type(dev);
|
||||||
|
ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||||
|
props->caps = {
|
||||||
|
/* async */ false,
|
||||||
|
/* host_buffer */ true,
|
||||||
|
/* events */ false,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) {
|
||||||
|
UNUSED(params);
|
||||||
|
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||||
|
return ggml_backend_vk_init(ctx->device);
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
||||||
switch (op->op) {
|
switch (op->op) {
|
||||||
case GGML_OP_UNARY:
|
case GGML_OP_UNARY:
|
||||||
switch (ggml_get_unary_op(op)) {
|
switch (ggml_get_unary_op(op)) {
|
||||||
@ -6701,97 +6827,101 @@ static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tenso
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
UNUSED(backend);
|
UNUSED(dev);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
|
static bool ggml_backend_vk_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
|
||||||
|
if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||||
|
ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
|
||||||
|
|
||||||
|
return buft_ctx->device->idx == ctx->device;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool ggml_backend_vk_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
||||||
const int min_batch_size = 32;
|
const int min_batch_size = 32;
|
||||||
|
|
||||||
return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
|
return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
|
||||||
(op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
|
(op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
|
||||||
|
|
||||||
UNUSED(backend);
|
UNUSED(dev);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
static const struct ggml_backend_device_i ggml_backend_vk_device_i = {
|
||||||
if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) {
|
/* .get_name = */ ggml_backend_vk_device_get_name,
|
||||||
return false;
|
/* .get_description = */ ggml_backend_vk_device_get_description,
|
||||||
}
|
/* .get_memory = */ ggml_backend_vk_device_get_memory,
|
||||||
|
/* .get_type = */ ggml_backend_vk_device_get_type,
|
||||||
ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
|
/* .get_props = */ ggml_backend_vk_device_get_props,
|
||||||
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
/* .init_backend = */ ggml_backend_vk_device_init,
|
||||||
|
/* .get_buffer_type = */ ggml_backend_vk_device_get_buffer_type,
|
||||||
return buft_ctx->device == ctx->device;
|
/* .get_host_buffer_type = */ ggml_backend_vk_device_get_host_buffer_type,
|
||||||
}
|
/* .buffer_from_host_ptr = */ NULL,
|
||||||
|
/* .supports_op = */ ggml_backend_vk_device_supports_op,
|
||||||
// TODO: enable async and synchronize
|
/* .supports_buft = */ ggml_backend_vk_device_supports_buft,
|
||||||
static ggml_backend_i ggml_backend_vk_interface = {
|
/* .offload_op = */ ggml_backend_vk_device_offload_op,
|
||||||
/* .get_name = */ ggml_backend_vk_name,
|
/* .event_new = */ NULL,
|
||||||
/* .free = */ ggml_backend_vk_free,
|
/* .event_free = */ NULL,
|
||||||
/* .get_default_buffer_type = */ ggml_backend_vk_get_default_buffer_type,
|
/* .event_synchronize = */ NULL,
|
||||||
/* .set_tensor_async = */ NULL, // ggml_backend_vk_set_tensor_async,
|
|
||||||
/* .get_tensor_async = */ NULL, // ggml_backend_vk_get_tensor_async,
|
|
||||||
/* .cpy_tensor_async = */ NULL, // ggml_backend_vk_cpy_tensor_async,
|
|
||||||
/* .synchronize = */ NULL, // ggml_backend_vk_synchronize,
|
|
||||||
/* .graph_plan_create = */ NULL,
|
|
||||||
/* .graph_plan_free = */ NULL,
|
|
||||||
/* .graph_plan_update = */ NULL,
|
|
||||||
/* .graph_plan_compute = */ NULL,
|
|
||||||
/* .graph_compute = */ ggml_backend_vk_graph_compute,
|
|
||||||
/* .supports_op = */ ggml_backend_vk_supports_op,
|
|
||||||
/* .supports_buft = */ ggml_backend_vk_supports_buft,
|
|
||||||
/* .offload_op = */ ggml_backend_vk_offload_op,
|
|
||||||
/* .event_record = */ NULL,
|
|
||||||
/* .event_wait = */ NULL,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
static ggml_guid_t ggml_backend_vk_guid() {
|
static const char * ggml_backend_vk_reg_get_name(ggml_backend_reg_t reg) {
|
||||||
static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b };
|
UNUSED(reg);
|
||||||
return &guid;
|
return GGML_VK_NAME;
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
|
static size_t ggml_backend_vk_reg_get_device_count(ggml_backend_reg_t reg) {
|
||||||
VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")");
|
UNUSED(reg);
|
||||||
|
return ggml_backend_vk_get_device_count();
|
||||||
|
}
|
||||||
|
|
||||||
ggml_backend_vk_context * ctx = new ggml_backend_vk_context;
|
static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, size_t device) {
|
||||||
ggml_vk_init(ctx, dev_num);
|
static std::vector<ggml_backend_dev_t> devices;
|
||||||
|
|
||||||
ggml_backend_t vk_backend = new ggml_backend {
|
static bool initialized = false;
|
||||||
/* .guid = */ ggml_backend_vk_guid(),
|
|
||||||
/* .interface = */ ggml_backend_vk_interface,
|
{
|
||||||
/* .device = */ nullptr,
|
static std::mutex mutex;
|
||||||
|
std::lock_guard<std::mutex> lock(mutex);
|
||||||
|
if (!initialized) {
|
||||||
|
for (size_t i = 0; i < ggml_backend_vk_get_device_count(); i++) {
|
||||||
|
ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
|
||||||
|
char desc[256];
|
||||||
|
ggml_backend_vk_get_device_description(i, desc, sizeof(desc));
|
||||||
|
ctx->device = i;
|
||||||
|
ctx->name = GGML_VK_NAME + std::to_string(i);
|
||||||
|
ctx->description = desc;
|
||||||
|
devices.push_back(new ggml_backend_device {
|
||||||
|
/* .iface = */ ggml_backend_vk_device_i,
|
||||||
|
/* .reg = */ reg,
|
||||||
/* .context = */ ctx,
|
/* .context = */ ctx,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
initialized = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
GGML_ASSERT(device < devices.size());
|
||||||
|
return devices[device];
|
||||||
|
}
|
||||||
|
|
||||||
|
static const struct ggml_backend_reg_i ggml_backend_vk_reg_i = {
|
||||||
|
/* .get_name = */ ggml_backend_vk_reg_get_name,
|
||||||
|
/* .get_device_count = */ ggml_backend_vk_reg_get_device_count,
|
||||||
|
/* .get_device = */ ggml_backend_vk_reg_get_device,
|
||||||
|
/* .get_proc_address = */ NULL,
|
||||||
};
|
};
|
||||||
|
|
||||||
return vk_backend;
|
ggml_backend_reg_t ggml_backend_vk_reg() {
|
||||||
}
|
static ggml_backend_reg reg = {
|
||||||
|
/* .iface = */ ggml_backend_vk_reg_i,
|
||||||
|
/* .context = */ nullptr,
|
||||||
|
};
|
||||||
|
|
||||||
bool ggml_backend_is_vk(ggml_backend_t backend) {
|
return &reg;
|
||||||
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_vk_guid());
|
|
||||||
}
|
|
||||||
|
|
||||||
int ggml_backend_vk_get_device_count() {
|
|
||||||
return ggml_vk_get_device_count();
|
|
||||||
}
|
|
||||||
|
|
||||||
void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size) {
|
|
||||||
ggml_vk_get_device_description(device, description, description_size);
|
|
||||||
}
|
|
||||||
|
|
||||||
void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
|
|
||||||
GGML_ASSERT(device < (int) vk_instance.device_indices.size());
|
|
||||||
|
|
||||||
vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
|
|
||||||
|
|
||||||
vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties();
|
|
||||||
|
|
||||||
for (const vk::MemoryHeap& heap : memprops.memoryHeaps) {
|
|
||||||
if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
|
|
||||||
*total = heap.size;
|
|
||||||
*free = heap.size;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Extension availability
|
// Extension availability
|
||||||
|
@ -8,9 +8,7 @@
|
|||||||
#include "ggml-alloc.h"
|
#include "ggml-alloc.h"
|
||||||
#include "ggml-backend.h"
|
#include "ggml-backend.h"
|
||||||
|
|
||||||
#if defined(GGML_USE_VULKAN)
|
#if defined(GGML_USE_SYCL)
|
||||||
# include "ggml-vulkan.h"
|
|
||||||
#elif defined(GGML_USE_SYCL)
|
|
||||||
# include "ggml-sycl.h"
|
# include "ggml-sycl.h"
|
||||||
#elif defined(GGML_USE_KOMPUTE)
|
#elif defined(GGML_USE_KOMPUTE)
|
||||||
# include "ggml-kompute.h"
|
# include "ggml-kompute.h"
|
||||||
@ -3418,8 +3416,6 @@ static int llama_get_device_count(const llama_model & model) {
|
|||||||
|
|
||||||
#if defined(GGML_USE_SYCL)
|
#if defined(GGML_USE_SYCL)
|
||||||
count += ggml_backend_sycl_get_device_count();
|
count += ggml_backend_sycl_get_device_count();
|
||||||
#elif defined(GGML_USE_VULKAN)
|
|
||||||
count += ggml_backend_vk_get_device_count();
|
|
||||||
#elif defined(GGML_USE_CANN)
|
#elif defined(GGML_USE_CANN)
|
||||||
count += ggml_backend_cann_get_device_count();
|
count += ggml_backend_cann_get_device_count();
|
||||||
#endif
|
#endif
|
||||||
@ -3451,10 +3447,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(const llama_mode
|
|||||||
}
|
}
|
||||||
#elif defined(GGML_USE_CPU_HBM)
|
#elif defined(GGML_USE_CPU_HBM)
|
||||||
buft = ggml_backend_cpu_hbm_buffer_type();
|
buft = ggml_backend_cpu_hbm_buffer_type();
|
||||||
#elif defined(GGML_USE_VULKAN)
|
|
||||||
if (host_buffer) {
|
|
||||||
buft = ggml_backend_vk_host_buffer_type();
|
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (buft == nullptr) {
|
if (buft == nullptr) {
|
||||||
@ -3473,9 +3465,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
|
|||||||
}
|
}
|
||||||
device -= (int)model.devices.size();
|
device -= (int)model.devices.size();
|
||||||
|
|
||||||
#if defined(GGML_USE_VULKAN)
|
#if defined(GGML_USE_SYCL)
|
||||||
buft = ggml_backend_vk_buffer_type(device);
|
|
||||||
#elif defined(GGML_USE_SYCL)
|
|
||||||
buft = ggml_backend_sycl_buffer_type(device);
|
buft = ggml_backend_sycl_buffer_type(device);
|
||||||
#elif defined(GGML_USE_KOMPUTE)
|
#elif defined(GGML_USE_KOMPUTE)
|
||||||
buft = ggml_backend_kompute_buffer_type(device);
|
buft = ggml_backend_kompute_buffer_type(device);
|
||||||
@ -3535,11 +3525,6 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
|
|||||||
size_t free;
|
size_t free;
|
||||||
ggml_backend_sycl_get_device_memory(device, &free, &total);
|
ggml_backend_sycl_get_device_memory(device, &free, &total);
|
||||||
return free;
|
return free;
|
||||||
#elif defined(GGML_USE_VULKAN)
|
|
||||||
size_t total;
|
|
||||||
size_t free;
|
|
||||||
ggml_backend_vk_get_device_memory(device, &free, &total);
|
|
||||||
return free;
|
|
||||||
#elif defined(GGML_USE_CANN)
|
#elif defined(GGML_USE_CANN)
|
||||||
size_t total;
|
size_t total;
|
||||||
size_t free;
|
size_t free;
|
||||||
@ -19095,8 +19080,7 @@ bool llama_supports_mlock(void) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool llama_supports_gpu_offload(void) {
|
bool llama_supports_gpu_offload(void) {
|
||||||
#if defined(GGML_USE_VULKAN) || \
|
#if defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
|
||||||
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
|
|
||||||
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
||||||
return true;
|
return true;
|
||||||
#else
|
#else
|
||||||
@ -19227,10 +19211,15 @@ struct llama_model * llama_load_model_from_file(
|
|||||||
|
|
||||||
case GGML_BACKEND_DEVICE_TYPE_GPU:
|
case GGML_BACKEND_DEVICE_TYPE_GPU:
|
||||||
case GGML_BACKEND_DEVICE_TYPE_GPU_FULL:
|
case GGML_BACKEND_DEVICE_TYPE_GPU_FULL:
|
||||||
|
{
|
||||||
|
size_t free, total; // NOLINT
|
||||||
|
ggml_backend_dev_memory(dev, &free, &total);
|
||||||
|
LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
|
||||||
model->devices.push_back(dev);
|
model->devices.push_back(dev);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
int status = llama_model_load(path_model, *model, params);
|
int status = llama_model_load(path_model, *model, params);
|
||||||
GGML_ASSERT(status <= 0);
|
GGML_ASSERT(status <= 0);
|
||||||
@ -19423,32 +19412,7 @@ struct llama_context * llama_new_context_with_model(
|
|||||||
main_gpu -= (int)model->devices.size();
|
main_gpu -= (int)model->devices.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(GGML_USE_VULKAN)
|
#if defined(GGML_USE_SYCL)
|
||||||
if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
|
||||||
LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
|
|
||||||
llama_free(ctx);
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
|
|
||||||
ggml_backend_t backend = ggml_backend_vk_init(main_gpu);
|
|
||||||
if (backend == nullptr) {
|
|
||||||
LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
|
|
||||||
llama_free(ctx);
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
ctx->backends.push_back(backend);
|
|
||||||
} else {
|
|
||||||
for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
|
|
||||||
ggml_backend_t backend = ggml_backend_vk_init(device);
|
|
||||||
if (backend == nullptr) {
|
|
||||||
LLAMA_LOG_ERROR("%s: failed to initialize Vulkan%d backend\n", __func__, device);
|
|
||||||
llama_free(ctx);
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
ctx->backends.push_back(backend);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#elif defined(GGML_USE_SYCL)
|
|
||||||
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
|
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
|
||||||
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
||||||
ggml_backend_t backend = ggml_backend_sycl_init(main_gpu);
|
ggml_backend_t backend = ggml_backend_sycl_init(main_gpu);
|
||||||
|
Loading…
Reference in New Issue
Block a user