mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-10 02:31:46 +00:00
ggml-ci
This commit is contained in:
parent
63c47ab8c3
commit
484984c8ec
@ -76,15 +76,15 @@ static T stdev(const std::vector<T> & v) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static std::string get_cpu_info() {
|
static std::string get_cpu_info() {
|
||||||
std::vector<std::string> gpu_list;
|
std::vector<std::string> cpu_list;
|
||||||
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
|
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
|
||||||
auto * dev = ggml_backend_dev_get(i);
|
auto * dev = ggml_backend_dev_get(i);
|
||||||
auto dev_type = ggml_backend_dev_type(dev);
|
auto dev_type = ggml_backend_dev_type(dev);
|
||||||
if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU || dev_type == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
|
if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU || dev_type == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
|
||||||
gpu_list.push_back(ggml_backend_dev_description(dev));
|
cpu_list.push_back(ggml_backend_dev_description(dev));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return join(gpu_list, ", ");
|
return join(cpu_list, ", ");
|
||||||
}
|
}
|
||||||
|
|
||||||
static std::string get_gpu_info() {
|
static std::string get_gpu_info() {
|
||||||
|
@ -168,10 +168,13 @@ extern "C" {
|
|||||||
GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
|
GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
|
||||||
GGML_API void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);
|
GGML_API void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);
|
||||||
|
|
||||||
|
|
||||||
// Common functions that may be obtained using ggml_backend_reg_get_proc_address
|
// Common functions that may be obtained using ggml_backend_reg_get_proc_address
|
||||||
|
|
||||||
|
// Split buffer type for tensor parallelism
|
||||||
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
|
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
|
||||||
|
// Set the number of threads for the backend
|
||||||
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
|
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
|
||||||
|
// Get additional buffer types provided by the device (returns a NULL-terminated array)
|
||||||
typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
|
typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
|
||||||
|
|
||||||
//
|
//
|
||||||
|
@ -95,8 +95,8 @@ extern "C" {
|
|||||||
// (optional) complete all pending operations (required if the backend supports async operations)
|
// (optional) complete all pending operations (required if the backend supports async operations)
|
||||||
void (*synchronize)(ggml_backend_t backend);
|
void (*synchronize)(ggml_backend_t backend);
|
||||||
|
|
||||||
// (optional) graph plans
|
// (optional) graph plans (not used currently)
|
||||||
// compute graph with a plan (not used currently)
|
// compute graph with a plan
|
||||||
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
|
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
|
||||||
void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||||
// update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
|
// update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
|
||||||
|
@ -1503,7 +1503,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co
|
|||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if 0
|
#if 1
|
||||||
#define GGML_SCHED_MAX_SPLITS_DEBUG 4096
|
#define GGML_SCHED_MAX_SPLITS_DEBUG 4096
|
||||||
static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
|
static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
|
||||||
#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
|
#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
|
||||||
@ -1906,11 +1906,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|||||||
if (src == NULL) {
|
if (src == NULL) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
// check if a weight is on a different backend
|
// check if a weight is on a different and incompatible backend
|
||||||
// by starting a new split, the memory of the previously offloaded weights can be reused
|
// by starting a new split, the memory of the previously offloaded weights can be reused
|
||||||
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
||||||
int src_backend_id = tensor_backend_id(src);
|
int src_backend_id = tensor_backend_id(src);
|
||||||
if (src_backend_id != cur_backend_id) {
|
if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
|
||||||
need_new_split = true;
|
need_new_split = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -1922,7 +1922,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|||||||
int src_backend_id = sched->hv_tensor_backend_ids[id];
|
int src_backend_id = sched->hv_tensor_backend_ids[id];
|
||||||
bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
|
bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
|
||||||
if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
|
if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
|
||||||
//printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
|
|
||||||
need_new_split = true;
|
need_new_split = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -3168,7 +3168,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
|||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_backend_cuda_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
|
static bool ggml_backend_cuda_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
|
||||||
GGML_ASSERT(buft != nullptr);
|
|
||||||
return (ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev;
|
return (ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3423,8 +3423,8 @@ static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t d
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<typename F>
|
template<typename F>
|
||||||
static ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t * buft_list, const F & fn) {
|
static ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & buft_list, const F & fn) {
|
||||||
for (const auto & cur : *buft_list) {
|
for (const auto & cur : buft_list) {
|
||||||
ggml_backend_dev_t cur_dev = cur.first;
|
ggml_backend_dev_t cur_dev = cur.first;
|
||||||
ggml_backend_buffer_type_t cur_buft = cur.second;
|
ggml_backend_buffer_type_t cur_buft = cur.second;
|
||||||
if (buft_supported(cur_buft, cur_dev, fn)) {
|
if (buft_supported(cur_buft, cur_dev, fn)) {
|
||||||
@ -3499,7 +3499,7 @@ static bool llama_kv_cache_init(
|
|||||||
} else {
|
} else {
|
||||||
buft_list = &model.cpu_buft_list;
|
buft_list = &model.cpu_buft_list;
|
||||||
}
|
}
|
||||||
ggml_backend_buffer_type_t buft = select_buft(buft_list,
|
ggml_backend_buffer_type_t buft = select_buft(*buft_list,
|
||||||
[&](ggml_context * ctx) {
|
[&](ggml_context * ctx) {
|
||||||
ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
|
ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
|
||||||
if (hparams.rope_type == LLAMA_ROPE_TYPE_NONE) {
|
if (hparams.rope_type == LLAMA_ROPE_TYPE_NONE) {
|
||||||
@ -6955,7 +6955,6 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//////// TODO: move elsewhere, maybe
|
|
||||||
enum llm_tensor_layer {
|
enum llm_tensor_layer {
|
||||||
LLM_TENSOR_LAYER_INPUT,
|
LLM_TENSOR_LAYER_INPUT,
|
||||||
LLM_TENSOR_LAYER_REPEATING,
|
LLM_TENSOR_LAYER_REPEATING,
|
||||||
@ -7093,7 +7092,7 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
|
|||||||
};
|
};
|
||||||
|
|
||||||
// checks if the weight tensor can be used with the specified buffer type and device
|
// checks if the weight tensor can be used with the specified buffer type and device
|
||||||
static bool weight_buft_supported(ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
|
static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
|
||||||
GGML_ASSERT(w != nullptr);
|
GGML_ASSERT(w != nullptr);
|
||||||
|
|
||||||
if (op == GGML_OP_NONE) {
|
if (op == GGML_OP_NONE) {
|
||||||
@ -7125,7 +7124,7 @@ static bool weight_buft_supported(ggml_tensor * w, ggml_op op, ggml_backend_buff
|
|||||||
} break;
|
} break;
|
||||||
case GGML_OP_MUL_MAT_ID:
|
case GGML_OP_MUL_MAT_ID:
|
||||||
{
|
{
|
||||||
int n_expert_used = 2; // TODO: from model
|
int n_expert_used = hparams.n_expert_used;
|
||||||
ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
|
ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
|
||||||
ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
|
ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
|
||||||
op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
|
op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
|
||||||
@ -7147,8 +7146,8 @@ static bool weight_buft_supported(ggml_tensor * w, ggml_op op, ggml_backend_buff
|
|||||||
} break;
|
} break;
|
||||||
case GGML_OP_ROPE:
|
case GGML_OP_ROPE:
|
||||||
{
|
{
|
||||||
int n_embd_head = 64; // TODO: from model
|
int n_embd_head = hparams.n_embd_head_v;
|
||||||
int n_head = 16;
|
int n_head = hparams.n_head();
|
||||||
ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
|
ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
|
||||||
ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
|
ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
|
||||||
op_tensor = ggml_rope_ext(
|
op_tensor = ggml_rope_ext(
|
||||||
@ -7190,12 +7189,12 @@ static bool weight_buft_supported(ggml_tensor * w, ggml_op op, ggml_backend_buff
|
|||||||
}
|
}
|
||||||
|
|
||||||
// find the first buffer type in the list that can use the tensor
|
// find the first buffer type in the list that can use the tensor
|
||||||
static ggml_backend_buffer_type_t select_weight_buft(ggml_tensor * tensor, ggml_op op, llama_model::buft_list_t * buft_list) {
|
static ggml_backend_buffer_type_t select_weight_buft(const llama_model & model, ggml_tensor * tensor, ggml_op op, const llama_model::buft_list_t & buft_list) {
|
||||||
GGML_ASSERT(!buft_list->empty());
|
GGML_ASSERT(!buft_list.empty());
|
||||||
for (auto & cur : *buft_list) {
|
for (const auto & cur : buft_list) {
|
||||||
ggml_backend_dev_t cur_dev = cur.first;
|
ggml_backend_dev_t cur_dev = cur.first;
|
||||||
ggml_backend_buffer_type_t cur_buft = cur.second;
|
ggml_backend_buffer_type_t cur_buft = cur.second;
|
||||||
if (weight_buft_supported(tensor, op, cur_buft, cur_dev)) {
|
if (weight_buft_supported(model.hparams, tensor, op, cur_buft, cur_dev)) {
|
||||||
return cur_buft;
|
return cur_buft;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -7420,8 +7419,6 @@ static bool llm_load_tensors(
|
|||||||
ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
|
ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
|
||||||
ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
|
ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
|
||||||
|
|
||||||
|
|
||||||
constexpr auto * func = __func__;
|
|
||||||
auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
|
auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
|
||||||
ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
|
ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
|
||||||
|
|
||||||
@ -7482,7 +7479,7 @@ static bool llm_load_tensors(
|
|||||||
GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
|
GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_buffer_type_t buft = select_weight_buft(t_meta, op, buft_list);
|
ggml_backend_buffer_type_t buft = select_weight_buft(model, t_meta, op, *buft_list);
|
||||||
if (!buft) {
|
if (!buft) {
|
||||||
throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
|
throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
|
||||||
}
|
}
|
||||||
@ -7512,8 +7509,7 @@ static bool llm_load_tensors(
|
|||||||
return t;
|
return t;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ggml_tensor * t = ml.create_tensor(ctx, tn, ne, flags);
|
return ml.create_tensor(ctx, tn, ne, flags);
|
||||||
return t;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
model.layers.resize(n_layer);
|
model.layers.resize(n_layer);
|
||||||
@ -9064,11 +9060,10 @@ static bool llm_load_tensors(
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (n_moved_tensors > 0) {
|
if (n_moved_tensors > 0) {
|
||||||
LLAMA_LOG_WARN("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
|
LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
|
||||||
func, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
|
__func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
|
||||||
ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
|
ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ml.done_getting_tensors();
|
ml.done_getting_tensors();
|
||||||
@ -9146,7 +9141,7 @@ static bool llm_load_tensors(
|
|||||||
|
|
||||||
for (auto & buf : bufs) {
|
for (auto & buf : bufs) {
|
||||||
// indicate that this buffer contains weights
|
// indicate that this buffer contains weights
|
||||||
// this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
|
// this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
|
||||||
ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
|
ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -19517,7 +19512,7 @@ struct llama_context * llama_new_context_with_model(
|
|||||||
GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0);
|
GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0);
|
||||||
|
|
||||||
if (!hparams.vocab_only) {
|
if (!hparams.vocab_only) {
|
||||||
// initialize backends
|
// GPU backends
|
||||||
for (auto * dev : model->devices) {
|
for (auto * dev : model->devices) {
|
||||||
ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
|
ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
|
||||||
if (backend == nullptr) {
|
if (backend == nullptr) {
|
||||||
@ -19528,7 +19523,7 @@ struct llama_context * llama_new_context_with_model(
|
|||||||
ctx->backends.push_back(backend);
|
ctx->backends.push_back(backend);
|
||||||
}
|
}
|
||||||
|
|
||||||
// add other backends (such as BLAS)
|
// add ACCEL backends (such as BLAS)
|
||||||
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
||||||
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
||||||
if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
|
if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
|
||||||
@ -19542,6 +19537,7 @@ struct llama_context * llama_new_context_with_model(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// add CPU backend
|
||||||
ctx->backend_cpu = ggml_backend_cpu_init();
|
ctx->backend_cpu = ggml_backend_cpu_init();
|
||||||
if (ctx->backend_cpu == nullptr) {
|
if (ctx->backend_cpu == nullptr) {
|
||||||
LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
|
LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
|
||||||
@ -19638,11 +19634,6 @@ struct llama_context * llama_new_context_with_model(
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
auto * dev = ggml_backend_get_device(backend);
|
auto * dev = ggml_backend_get_device(backend);
|
||||||
if (!dev) {
|
|
||||||
// backend is using old interface, not supported
|
|
||||||
pipeline_parallel = false;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
ggml_backend_dev_props props;
|
ggml_backend_dev_props props;
|
||||||
ggml_backend_dev_get_props(dev, &props);
|
ggml_backend_dev_get_props(dev, &props);
|
||||||
if (!props.caps.async || !props.caps.events) {
|
if (!props.caps.async || !props.caps.events) {
|
||||||
@ -19667,17 +19658,19 @@ struct llama_context * llama_new_context_with_model(
|
|||||||
llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
|
llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
|
||||||
ggml_cgraph * gf_pp = llama_build_graph(*ctx, ubatch_pp, true);
|
ggml_cgraph * gf_pp = llama_build_graph(*ctx, ubatch_pp, true);
|
||||||
|
|
||||||
|
// reserve pp graph first so that buffers are only allocated once
|
||||||
ggml_backend_sched_reserve(ctx->sched, gf_pp);
|
ggml_backend_sched_reserve(ctx->sched, gf_pp);
|
||||||
int n_splits_pp = ggml_backend_sched_get_n_splits(ctx->sched);
|
int n_splits_pp = ggml_backend_sched_get_n_splits(ctx->sched);
|
||||||
int n_nodes_pp = ggml_graph_n_nodes(gf_pp);
|
int n_nodes_pp = ggml_graph_n_nodes(gf_pp);
|
||||||
|
|
||||||
|
// reserve with tg graph to get the number of splits and nodes
|
||||||
llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
|
llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
|
||||||
ggml_cgraph * gf_tg = llama_build_graph(*ctx, ubatch_tg, true);
|
ggml_cgraph * gf_tg = llama_build_graph(*ctx, ubatch_tg, true);
|
||||||
ggml_backend_sched_reserve(ctx->sched, gf_tg);
|
ggml_backend_sched_reserve(ctx->sched, gf_tg);
|
||||||
int n_splits_tg = ggml_backend_sched_get_n_splits(ctx->sched);
|
int n_splits_tg = ggml_backend_sched_get_n_splits(ctx->sched);
|
||||||
int n_nodes_tg = ggml_graph_n_nodes(gf_tg);
|
int n_nodes_tg = ggml_graph_n_nodes(gf_tg);
|
||||||
|
|
||||||
// restore
|
// reserve again with pp graph to avoid ggml-alloc reallocations during inference
|
||||||
gf_pp = llama_build_graph(*ctx, ubatch_pp, false);
|
gf_pp = llama_build_graph(*ctx, ubatch_pp, false);
|
||||||
if (!ggml_backend_sched_reserve(ctx->sched, gf_pp)) {
|
if (!ggml_backend_sched_reserve(ctx->sched, gf_pp)) {
|
||||||
LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
|
LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
|
||||||
@ -19989,7 +19982,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
|
|||||||
cvec.tensors.reserve(model.hparams.n_layer);
|
cvec.tensors.reserve(model.hparams.n_layer);
|
||||||
cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
|
cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
|
||||||
for (size_t il = 1; il < model.hparams.n_layer; il++) {
|
for (size_t il = 1; il < model.hparams.n_layer; il++) {
|
||||||
ggml_backend_buffer_type_t buft = select_buft(model.dev_layer.at(il).buft_list,
|
ggml_backend_buffer_type_t buft = select_buft(*model.dev_layer.at(il).buft_list,
|
||||||
[&](ggml_context * ctx) {
|
[&](ggml_context * ctx) {
|
||||||
ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
|
ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
|
||||||
ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
|
ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
|
||||||
|
Loading…
Reference in New Issue
Block a user