add device props/caps, fully support async upload for all compatible backends

This commit is contained in:
slaren 2024-10-02 01:23:54 +02:00
parent 805fea97ac
commit 6ff0e7a32e
6 changed files with 221 additions and 93 deletions

View File

@ -24,7 +24,7 @@ extern "C" {
GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft);
@ -53,6 +53,9 @@ extern "C" {
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
//
// Backend (stream)
//
@ -88,38 +91,61 @@ extern "C" {
GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
// asynchronous copy
// the copy is performed after all the currently queued operations in backend_src
// backend_dst will wait for the copy to complete before performing other operations
// automatic fallback to sync copy if async is not supported
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
// events
GGML_API ggml_backend_event_t ggml_backend_event_new (ggml_backend_dev_t device);
GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
GGML_API void ggml_backend_event_record (ggml_backend_event_t event, ggml_backend_t backend);
GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend);
//
// Events
//
GGML_API ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device);
GGML_API void ggml_backend_event_free(ggml_backend_event_t event);
GGML_API void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend);
GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event);
GGML_API void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event);
//
// Backend device
//
enum ggml_backend_device_type {
enum ggml_backend_dev_type {
GGML_BACKEND_DEVICE_TYPE_CPU,
GGML_BACKEND_DEVICE_TYPE_GPU,
// devices with full capabilities (excludes backends such as BLAS)
// devices with full capabilities (excludes backends such as BLAS that only support matrix multiplication)
GGML_BACKEND_DEVICE_TYPE_CPU_FULL,
GGML_BACKEND_DEVICE_TYPE_GPU_FULL
};
// functionality supported by the device
struct ggml_backend_dev_caps {
    bool async;       // the device supports asynchronous operations
    bool host_buffer; // the device supports a pinned host buffer
    bool events;      // the device supports event synchronization
};
// all the device properties
struct ggml_backend_dev_props {
    const char * name;                 // device name
    const char * description;          // device description
    size_t memory_free;                // free device memory
    size_t memory_total;               // total device memory
    enum ggml_backend_dev_type type;   // device type
    struct ggml_backend_dev_caps caps; // functionality supported by the device
};
GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device);
GGML_API const char * ggml_backend_dev_description(ggml_backend_dev_t device);
GGML_API void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total);
GGML_API enum ggml_backend_device_type ggml_backend_dev_type(ggml_backend_dev_t device);
GGML_API enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device);
GGML_API void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props);
GGML_API ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device);
GGML_API ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params);
GGML_API ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device);
@ -130,8 +156,6 @@ extern "C" {
GGML_API bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft);
GGML_API bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op);
GGML_API ggml_backend_event_t ggml_backend_dev_event_new(ggml_backend_dev_t device);
//
// Backend (reg)
//
@ -158,16 +182,16 @@ extern "C" {
GGML_API size_t ggml_backend_dev_count(void);
GGML_API ggml_backend_dev_t ggml_backend_dev_get(size_t index);
GGML_API ggml_backend_dev_t ggml_backend_dev_by_name(const char * name);
GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_device_type type);
GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type);
// Set the log callback for all registered backends
GGML_API void ggml_backend_set_log_callback(ggml_log_callback log_callback, void * user_data);
// Direct Backend (stream) initialization
// Direct backend (stream) initialization
// = ggml_backend_dev_init(ggml_backend_dev_by_name(name), params)
GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
// = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_device_type type, const char * params);
GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
// = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU_FULL) OR ggml_backend_dev_by_type(CPU_FULL), NULL)
GGML_API ggml_backend_t ggml_backend_init_best(void);
@ -276,7 +300,6 @@ extern "C" {
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
//
// CPU backend
//

View File

@ -9,10 +9,9 @@ extern "C" {
#endif
//
// Backend buffer
// Backend buffer type
//
// buffer type
struct ggml_backend_buffer_type_i {
const char * (*get_name) (ggml_backend_buffer_type_t buft);
// allocate a buffer of this type
@ -33,7 +32,10 @@ extern "C" {
void * context;
};
// buffer
//
// Backend buffer
//
struct ggml_backend_buffer_i {
const char * (*get_name) (ggml_backend_buffer_t buffer);
// (optional) free the buffer
@ -143,15 +145,26 @@ extern "C" {
};
//
// Backend registry v2
// Backend device
//
// Note: if additional properties are needed, we should add a struct with all of them
// the current functions to obtain the properties can remain, since they are more convenient for often used properties
struct ggml_backend_device_i {
// device properties
// device name: short identifier for this device, such as "CPU" or "CUDA0"
const char * (*get_name)(ggml_backend_dev_t dev);
// device description: short informative description of the device, could be the model name
const char * (*get_description)(ggml_backend_dev_t dev);
// device memory in bytes
void (*get_memory)(ggml_backend_dev_t dev, size_t * free, size_t * total);
enum ggml_backend_device_type (*get_type)(ggml_backend_dev_t dev);
// device type
enum ggml_backend_dev_type (*get_type)(ggml_backend_dev_t dev);
// device properties
void (*get_props)(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props);
// get the backend (reg) associated with this device
ggml_backend_reg_t (*get_backend_reg)(ggml_backend_dev_t dev);
@ -190,6 +203,10 @@ extern "C" {
void * context;
};
//
// Backend (reg)
//
struct ggml_backend_reg_i {
const char * (*get_name)(ggml_backend_reg_t reg);
@ -212,7 +229,7 @@ extern "C" {
};
// Internal API
// Internal backend registry API
void ggml_backend_register(ggml_backend_reg_t reg);
void ggml_backend_device_register(ggml_backend_dev_t device);
// TODO: backends can be loaded as dynamic libraries, in which case they need to export this function

View File

@ -331,6 +331,10 @@ bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor *
return false;
}
// get the device stored in this backend (stream)
ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
    ggml_backend_dev_t dev = backend->device;
    return dev;
}
// backend copy
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
@ -440,10 +444,14 @@ void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t *
device->iface.get_memory(device, free, total);
}
enum ggml_backend_device_type ggml_backend_dev_type(ggml_backend_dev_t device) {
enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) {
return device->iface.get_type(device);
}
// Fill `props` with the properties of `device` in a single call, by forwarding
// to the device interface's get_props implementation.
//
// `props` is cleared before it is handed to the backend, so that any field the
// backend implementation does not write reads as zero/NULL/false instead of
// being indeterminate (callers may pass an uninitialized struct on the stack).
void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props) {
    *props = {};
    device->iface.get_props(device, props);
}
// get the backend (reg) associated with this device
ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device) {
    ggml_backend_reg_t reg = device->iface.get_backend_reg(device);
    return reg;
}
@ -476,13 +484,6 @@ bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_te
return device->iface.offload_op(device, op);
}
// create a new event for this device
// returns NULL if the device interface does not provide event_new
ggml_backend_event_t ggml_backend_dev_event_new(ggml_backend_dev_t device) {
    return device->iface.event_new ? device->iface.event_new(device) : NULL;
}
// Backend (reg)
const char * ggml_backend_reg_name(ggml_backend_reg_t reg) {
@ -603,7 +604,7 @@ ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
return NULL;
}
ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_device_type type) {
ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
if (ggml_backend_dev_type(dev) == type) {
@ -629,7 +630,7 @@ ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params)
return ggml_backend_dev_init(dev, params);
}
ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_device_type type, const char * params) {
ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) {
ggml_backend_dev_t dev = ggml_backend_dev_by_type(type);
if (!dev) {
return NULL;
@ -1028,60 +1029,72 @@ static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user
////////////////////////
static const char * ggml_backend_cpu_device_name(ggml_backend_dev_t device) {
static const char * ggml_backend_cpu_device_name(ggml_backend_dev_t dev) {
return "CPU";
GGML_UNUSED(device);
GGML_UNUSED(dev);
}
static const char * ggml_backend_cpu_device_description(ggml_backend_dev_t device) {
static const char * ggml_backend_cpu_device_description(ggml_backend_dev_t dev) {
// TODO
return "CPU";
GGML_UNUSED(device);
GGML_UNUSED(dev);
}
static void ggml_backend_cpu_device_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
static void ggml_backend_cpu_device_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
// TODO
*free = 0;
*total = 0;
GGML_UNUSED(device);
GGML_UNUSED(dev);
}
static enum ggml_backend_device_type ggml_backend_cpu_device_type(ggml_backend_dev_t device) {
static enum ggml_backend_dev_type ggml_backend_cpu_device_type(ggml_backend_dev_t dev) {
return GGML_BACKEND_DEVICE_TYPE_CPU_FULL;
GGML_UNUSED(device);
GGML_UNUSED(dev);
}
static ggml_backend_reg_t ggml_backend_cpu_device_reg(ggml_backend_dev_t device) {
// fill in all the properties of the CPU device using the individual getters
static void ggml_backend_cpu_device_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
    props->name        = ggml_backend_cpu_device_name(dev);
    props->description = ggml_backend_cpu_device_description(dev);
    props->type        = ggml_backend_cpu_device_type(dev);

    ggml_backend_cpu_device_memory(dev, &props->memory_free, &props->memory_total);

    // none of the capabilities are reported for the CPU device
    props->caps.async       = false;
    props->caps.host_buffer = false;
    props->caps.events      = false;
}
static ggml_backend_reg_t ggml_backend_cpu_device_reg(ggml_backend_dev_t dev) {
return ggml_backend_cpu_reg();
GGML_UNUSED(device);
GGML_UNUSED(dev);
}
static ggml_backend_t ggml_backend_cpu_device_init(ggml_backend_dev_t device, const char * params) {
static ggml_backend_t ggml_backend_cpu_device_init(ggml_backend_dev_t dev, const char * params) {
return ggml_backend_cpu_init();
GGML_UNUSED(device);
GGML_UNUSED(dev);
GGML_UNUSED(params);
}
static ggml_backend_buffer_type_t ggml_backend_cpu_device_buffer_type(ggml_backend_dev_t device) {
static ggml_backend_buffer_type_t ggml_backend_cpu_device_buffer_type(ggml_backend_dev_t dev) {
return ggml_backend_cpu_buffer_type();
GGML_UNUSED(device);
GGML_UNUSED(dev);
}
static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) {
static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
return ggml_backend_cpu_buffer_from_ptr(ptr, size);
GGML_UNUSED(device);
GGML_UNUSED(dev);
GGML_UNUSED(max_tensor_size);
}
static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
switch (op->op) {
case GGML_OP_CPY:
return
@ -1101,13 +1114,13 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t device, const
return true;
}
GGML_UNUSED(device);
GGML_UNUSED(dev);
}
static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft) {
static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
return ggml_backend_buft_is_host(buft);
GGML_UNUSED(device);
GGML_UNUSED(dev);
}
struct ggml_backend_device_i ggml_backend_cpu_device_i = {
@ -1115,6 +1128,7 @@ struct ggml_backend_device_i ggml_backend_cpu_device_i = {
/* .get_description = */ ggml_backend_cpu_device_description,
/* .get_memory = */ ggml_backend_cpu_device_memory,
/* .get_type = */ ggml_backend_cpu_device_type,
/* .get_props = */ ggml_backend_cpu_device_props,
/* .get_backend_reg = */ ggml_backend_cpu_device_reg,
/* .init_backend = */ ggml_backend_cpu_device_init,
/* .buffer_type = */ ggml_backend_cpu_device_buffer_type,

View File

@ -2929,11 +2929,31 @@ static void ggml_backend_cuda_device_memory(ggml_backend_dev_t dev, size_t * fre
CUDA_CHECK(cudaMemGetInfo(free, total));
}
static enum ggml_backend_device_type ggml_backend_cuda_device_type(ggml_backend_dev_t dev) {
static enum ggml_backend_dev_type ggml_backend_cuda_device_type(ggml_backend_dev_t dev) {
GGML_UNUSED(dev);
return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
}
// fill in all the properties of a CUDA device using the individual getters
static void ggml_backend_cuda_device_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
    props->name        = ggml_backend_cuda_device_name(dev);
    props->description = ggml_backend_cuda_device_description(dev);
    props->type        = ggml_backend_cuda_device_type(dev);
    ggml_backend_cuda_device_memory(dev, &props->memory_free, &props->memory_total);

    // the host buffer capability is reported unless GGML_CUDA_NO_PINNED is set in the environment
    const bool has_host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;

    // the events capability is decided at build time by GGML_CUDA_NO_PEER_COPY
#ifdef GGML_CUDA_NO_PEER_COPY
    const bool has_events = false;
#else
    const bool has_events = true;
#endif

    props->caps = {
        /* async       */ true,
        /* host_buffer */ has_host_buffer,
        /* events      */ has_events,
    };
}
static ggml_backend_reg_t ggml_backend_cuda_device_reg(ggml_backend_dev_t dev) {
GGML_UNUSED(dev);
return ggml_backend_cuda_reg();
@ -3206,7 +3226,6 @@ static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const gg
GGML_UNUSED(dev);
}
static ggml_backend_event_t ggml_backend_cuda_device_event_new(ggml_backend_dev_t dev) {
#ifdef GGML_CUDA_NO_PEER_COPY
return nullptr;
@ -3242,6 +3261,7 @@ static ggml_backend_device_i ggml_backend_cuda_device_interface = {
/* .get_description = */ ggml_backend_cuda_device_description,
/* .get_memory = */ ggml_backend_cuda_device_memory,
/* .get_type = */ ggml_backend_cuda_device_type,
/* .get_props = */ ggml_backend_cuda_device_props,
/* .get_backend_reg = */ ggml_backend_cuda_device_reg,
/* .init_backend = */ ggml_backend_cuda_device_init,
/* .buffer_type = */ ggml_backend_cuda_device_buffer_type,

View File

@ -5035,7 +5035,7 @@ struct llama_model_loader {
// Returns false if cancelled by progress_callback
bool load_all_data(
struct ggml_context * ctx,
llama_buf_map & bufs_mmap,
llama_buf_map & bufs,
llama_mlocks * lmlocks,
llama_progress_callback progress_callback,
void * progress_callback_user_data) {
@ -5044,7 +5044,6 @@ struct llama_model_loader {
std::vector<no_init<uint8_t>> read_buf;
std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
// TODO: adapt to ggml-backend
// 4 staging buffers for async uploads, each sized 1MB, seem to be a good default for single NVMe drives.
// NVMe RAID configurations might require more / larger buffers.
constexpr size_t n_buffers = 4;
@ -5054,26 +5053,84 @@ struct llama_model_loader {
std::vector<ggml_backend_event_t> events;
std::vector<void *> host_ptrs;
size_t buffer_idx = 0; // buffer to use for async loads
// TODO: only do this if the backend supports all the required features: async, events, pinned memory
// it also must be avoided for split buffers and other buffers that require the entire tensor to be loaded at once
ggml_backend_t upload_backend = nullptr;
if (!use_mmap && !check_tensors) {
ggml_backend_t upload_backend = [&](const char * fn) -> ggml_backend_t {
if (use_mmap || check_tensors) {
return nullptr;
}
// When not using mmapped I/O, use async uploads from pinned memory to GPU memory.
// First determine if the CUDA backend is active, and if so, determine the device ID.
ggml_backend_buffer_t buf = bufs_mmap.count(0) ? bufs_mmap.at(0) : nullptr;
ggml_backend_dev_t dev = buf ? ggml_backend_buft_get_device(ggml_backend_buffer_get_type(buf)) : nullptr;
ggml_backend_buffer_type_t host_buft = dev ? ggml_backend_dev_host_buffer_type(dev) : nullptr;
upload_backend = host_buft ? ggml_backend_dev_init(dev, nullptr) : nullptr;
// First determine if the backend supports the necessary features for async uploads.
auto * buf = bufs.count(0) ? bufs.at(0) : nullptr;
if (!buf) {
LLAMA_LOG_DEBUG("%s: no buffer found for async uploads\n", fn);
return nullptr;
}
// If the cuda is active create pinned memory buffers and events for synchronisation.
if (upload_backend) {
auto * buft = ggml_backend_buffer_get_type(buf);
auto * dev = ggml_backend_buft_get_device(buft);
if (!dev) {
LLAMA_LOG_DEBUG("%s: no device found for buffer type %s for async uploads\n", fn,
ggml_backend_buft_name(buft));
return nullptr;
}
if (buft != ggml_backend_dev_buffer_type(dev)) {
LLAMA_LOG_DEBUG("%s: buffer type %s is not the default buffer type for device %s for async uploads\n", fn,
ggml_backend_buft_name(buft), ggml_backend_dev_name(dev));
return nullptr;
}
ggml_backend_dev_props props;
ggml_backend_dev_get_props(dev, &props);
if (!props.caps.async || !props.caps.host_buffer || !props.caps.events) {
LLAMA_LOG_DEBUG("%s: device %s does not support async, host buffers or events\n", fn,
ggml_backend_dev_name(dev));
return nullptr;
}
auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
if (!host_buft) {
LLAMA_LOG_DEBUG("%s: no host buffer type found for device %s\n", fn,
ggml_backend_dev_name(dev));
return nullptr;
}
// If the backend is supported, create pinned memory buffers and events for synchronisation.
for (size_t idx = 0; idx < n_buffers; ++idx) {
host_buffers.emplace_back(ggml_backend_buft_alloc_buffer(host_buft, buffer_size));
host_ptrs.emplace_back(ggml_backend_buffer_get_base(host_buffers[idx]));
events.emplace_back(ggml_backend_dev_event_new(dev));
auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
if (!buf) {
LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", fn,
ggml_backend_dev_name(dev));
return nullptr;
}
host_buffers.emplace_back(buf);
host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf));
auto * event = ggml_backend_event_new(dev);
if (!event) {
LLAMA_LOG_DEBUG("%s: failed to create event for async uploads for device %s\n", fn,
ggml_backend_dev_name(dev));
return nullptr;
}
events.emplace_back(event);
}
ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
if (!backend) {
LLAMA_LOG_DEBUG("%s: failed to initialize backend for device %s for async uploads\n", fn,
ggml_backend_dev_name(dev));
return nullptr;
}
return backend;
}(__func__);
if (upload_backend) {
LLAMA_LOG_DEBUG("%s: using async uploads for device %s, buffer type %s, backend %s\n", __func__,
ggml_backend_dev_name(ggml_backend_get_device(upload_backend)),
ggml_backend_buft_name(ggml_backend_buffer_get_type(bufs.at(0))),
ggml_backend_name(upload_backend));
}
for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
@ -5094,8 +5151,8 @@ struct llama_model_loader {
if (use_mmap) {
const auto & mapping = mappings.at(weight->idx);
ggml_backend_buffer_t buf_mmap = nullptr;
if (bufs_mmap.count(weight->idx)) {
buf_mmap = bufs_mmap.at(weight->idx);
if (bufs.count(weight->idx)) {
buf_mmap = bufs.at(weight->idx);
}
uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
@ -5131,7 +5188,7 @@ struct llama_model_loader {
}));
}
} else {
// If cuda_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
// If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
if (upload_backend) {
file->seek(weight->offs, SEEK_SET);

View File

@ -672,14 +672,11 @@ struct test_case {
}
// run
ggml_backend_synchronize(backend);
int64_t total_time_us = 0;
int total_runs = 0;
do {
int64_t start_time = ggml_time_us();
ggml_backend_graph_compute(backend, gf);
ggml_backend_synchronize(backend);
int64_t end_time = ggml_time_us();
total_time_us += end_time - start_time;