llama.cpp (https://github.com/ggerganov/llama.cpp.git)
commit cc1c017191 (parent c4965a64f7)

naming : normalize the name of callback-related identifiers

ggml-ci
@@ -1018,7 +1018,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
 cparams.attention_type = params.attention_type;
 cparams.defrag_thold = params.defrag_thold;
 cparams.cb_eval = params.cb_eval;
-cparams.cb_eval_user_data = params.cb_eval_user_data;
+cparams.cb_eval_ctx = params.cb_eval_ctx;
 cparams.offload_kqv = !params.no_kv_offload;
 cparams.flash_attn = params.flash_attn;
 cparams.no_perf = params.no_perf;
@@ -173,8 +173,8 @@ struct gpt_params {
 struct cpu_params draft_cpuparams;
 struct cpu_params draft_cpuparams_batch;
 
 ggml_backend_sched_eval_callback cb_eval = nullptr;
-void * cb_eval_user_data = nullptr;
+void * cb_eval_ctx = nullptr;
 
 ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
 
@@ -50,7 +50,7 @@ static void print_usage(int, char ** argv) {
 
 
 // cb_eval is reused for each pair of positive - negative prompt
-struct callback_data {
+struct callback_context {
 ggml_context * ctx_ggml = nullptr; // holds v_pos, v_neg, v_diff_filtered
 
 int n_layers = 0;
@@ -155,7 +155,7 @@ struct callback_data {
 return diff_filtered;
 }
 
-// we don't implement destructor, because we want to reuse callback_data. we just want to free the tensors
+// we don't implement destructor, because we want to reuse callback_context. we just want to free the tensors
 void reset() {
 for (auto ptr : v_pos) free(ptr->data);
 for (auto ptr : v_neg) free(ptr->data);
@@ -320,7 +320,7 @@ static std::vector<std::string> ctrlvec_load_prompt_file(std::string path, bool
 //////////////////////////////////////////////////
 
 static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
-auto * cb_data = (callback_data *) user_data;
+auto * cb_ctx = (callback_context *) user_data;
 static const char * l_out_name = "l_out";
 const bool is_l_out = strncmp(t->name, l_out_name, strlen(l_out_name)) == 0;
 
@@ -328,12 +328,12 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
 return is_l_out;
 }
 
-if (!is_l_out || t->ne[1] != cb_data->n_tokens) {
+if (!is_l_out || t->ne[1] != cb_ctx->n_tokens) {
 return true;
 }
 
 // save the tensor to current context
-cb_data->save_tensor_for_layer(t);
+cb_ctx->save_tensor_for_layer(t);
 return true;
 }
 
@@ -400,12 +400,12 @@ int main(int argc, char ** argv) {
 }
 
 
-callback_data cb_data;
+callback_context cb_ctx;
 
 // pass the callback to the backend scheduler
 // it will be executed for each node during the graph computation
 params.cb_eval = cb_eval;
-params.cb_eval_user_data = &cb_data;
+params.cb_eval_ctx = &cb_ctx;
 params.warmup = false;
 
 print_build_info();
@@ -445,8 +445,8 @@ int main(int argc, char ** argv) {
 for(size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
 bool success = false;
 tokenized_prompt t = tokenized_prompts[i];
-cb_data.n_layers = n_layers;
-cb_data.n_tokens = t.max_seq_len;
+cb_ctx.n_layers = n_layers;
+cb_ctx.n_tokens = t.max_seq_len;
 
 printf("Evaluating prompt[%d/%d]: \"%s\" - \"%s\" (%d tokens)\n",
 (int) i+1, (int) ctx_train.positive_entries.size(),
@@ -454,22 +454,22 @@ int main(int argc, char ** argv) {
 tokens_to_str(ctx, t.tokens_neg.cbegin(), t.tokens_neg.cend()).c_str(),
 (int) t.max_seq_len);
 
-cb_data.is_eval_pos = true;
+cb_ctx.is_eval_pos = true;
 success = get_hidden_layers(ctx, t.tokens_pos);
 if (!success) break;
 
-cb_data.is_eval_pos = false;
+cb_ctx.is_eval_pos = false;
 success = get_hidden_layers(ctx, t.tokens_neg);
 if (!success) break;
 
 // calculate diff and remove all zero rows
-auto v_diff_filtered = cb_data.calc_diff();
+auto v_diff_filtered = cb_ctx.calc_diff();
 
 // save & concat the filtered v_diff to ctx_train
 ctx_train.concat_diff_tmp(v_diff_filtered);
 
 // reset for next iteration
-cb_data.reset();
+cb_ctx.reset();
 }
 
 // done with the model, we can now free it to make gain some memory
@@ -12,7 +12,7 @@
 * This the arbitrary data which will be passed to each callback.
 * Later on we can for example add operation or tensor name filter from the CLI arg, or a file descriptor to dump the tensor.
 */
-struct callback_data {
+struct callback_context {
 std::vector<uint8_t> data;
 };
 
@@ -27,7 +27,7 @@ static std::string ggml_ne_string(const ggml_tensor * t) {
 return str;
 }
 
-static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
+static void ggml_print_tensor(const uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
 GGML_ASSERT(n > 0);
 float sum = 0;
 for (int64_t i3 = 0; i3 < ne[3]; i3++) {
@@ -52,15 +52,15 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
 size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
 float v;
 if (type == GGML_TYPE_F16) {
-v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
+v = ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]);
 } else if (type == GGML_TYPE_F32) {
-v = *(float *) &data[i];
+v = *(const float *) &data[i];
 } else if (type == GGML_TYPE_I32) {
-v = (float) *(int32_t *) &data[i];
+v = (float) *(const int32_t *) &data[i];
 } else if (type == GGML_TYPE_I16) {
-v = (float) *(int16_t *) &data[i];
+v = (float) *(const int16_t *) &data[i];
 } else if (type == GGML_TYPE_I8) {
-v = (float) *(int8_t *) &data[i];
+v = (float) *(const int8_t *) &data[i];
 } else {
 GGML_ABORT("fatal error");
 }
@@ -88,7 +88,7 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
 * @return true to receive data or continue the graph, false otherwise
 */
 static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
-auto * cb_data = (callback_data *) user_data;
+auto * cb_ctx = (callback_context *) user_data;
 
 const struct ggml_tensor * src0 = t->src[0];
 const struct ggml_tensor * src1 = t->src[1];
@@ -114,12 +114,12 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
 
 if (!is_host) {
 auto n_bytes = ggml_nbytes(t);
-cb_data->data.resize(n_bytes);
-ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
+cb_ctx->data.resize(n_bytes);
+ggml_backend_tensor_get(t, cb_ctx->data.data(), 0, n_bytes);
 }
 
 if (!ggml_is_quantized(t->type)) {
-uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
+uint8_t * data = is_host ? (uint8_t *) t->data : cb_ctx->data.data();
 ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
 }
 
@@ -140,7 +140,7 @@ static bool run(llama_context * ctx, const gpt_params & params) {
 }
 
 int main(int argc, char ** argv) {
-callback_data cb_data;
+callback_context cb_ctx;
 
 gpt_params params;
 
@@ -156,7 +156,7 @@ int main(int argc, char ** argv) {
 // pass the callback to the backend scheduler
 // it will be executed for each node during the graph computation
 params.cb_eval = ggml_debug;
-params.cb_eval_user_data = &cb_data;
+params.cb_eval_ctx = &cb_ctx;
 params.warmup = false;
 
 // init
@@ -602,7 +602,7 @@ int main(int argc, char ** argv) {
 // pass the callback to the backend scheduler
 // it will be executed for each node during the graph computation
 params.cb_eval = ik_collect_imatrix;
-params.cb_eval_user_data = NULL;
+params.cb_eval_ctx = NULL;
 params.warmup = false;
 
 // init
@@ -104,7 +104,7 @@ extern "C" {
 GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
 GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
 GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
-GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
+GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback cb, void * cb_ctx);
 
 // Create a backend buffer from an existing pointer
 GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
@@ -177,7 +177,7 @@ extern "C" {
 // when ask == false, the scheduler is passing the node tensor to the user for observation
 // if the user returns false, the scheduler will cancel the graph compute
 //
-typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
+typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * cb_ctx);
 
 // Initialize a backend scheduler
 GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
@@ -208,7 +208,7 @@ extern "C" {
 GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
 
 // Set a callback to be called for each resulting node during graph compute
-GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
+GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback cb, void * cb_ctx);
 
 //
 // Utils
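Aside (editorial illustration, not part of the commit): the two hunks above define the scheduler eval-callback protocol with the renamed `cb` / `cb_ctx` parameters. Below is a minimal sketch of a user-side observer wired in through `ggml_backend_sched_set_eval_callback`; the `my_observer` / `my_state` names are invented for the example.

```cpp
// Sketch: observe MUL_MAT results during graph compute via the renamed cb/cb_ctx pair.
#include <cstdio>
#include "ggml.h"
#include "ggml-backend.h"

struct my_state {
    int n_observed = 0; // arbitrary user data carried through cb_ctx
};

// signature matches ggml_backend_sched_eval_callback
static bool my_observer(struct ggml_tensor * t, bool ask, void * cb_ctx) {
    auto * st = (my_state *) cb_ctx;
    if (ask) {
        // ask == true: return true if we want this node's data once it is computed
        return t->op == GGML_OP_MUL_MAT;
    }
    // ask == false: the node has been computed and can be inspected here
    st->n_observed++;
    printf("observed %s (%s)\n", t->name, ggml_op_name(t->op));
    return true; // returning false cancels the graph compute
}

// after the scheduler has been created:
//   my_state state;
//   ggml_backend_sched_set_eval_callback(sched, my_observer, &state);
```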
@@ -225,10 +225,10 @@ extern "C" {
 GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
 GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
 
-typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
+typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * cb_ctx);
 
 // Compare the output of two backends
-GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
+GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback cb_eval, void * cb_eval_ctx);
 
 // Tensor initialization
 GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
@@ -40,7 +40,7 @@ extern "C" {
 // user-code should use only these functions
 //
 
-GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
+GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback cb, void * cb_ctx);
 
 GGML_API ggml_backend_t ggml_backend_metal_init(void);
 
@@ -50,7 +50,7 @@ GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void
 
 GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
 
-GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
+GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback cb, void * cb_ctx);
 
 GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
 
@@ -620,7 +620,7 @@ extern "C" {
 // Abort callback
 // If not NULL, called before ggml computation
 // If it returns true, the computation is aborted
-typedef bool (*ggml_abort_callback)(void * data);
+typedef bool (*ggml_abort_callback)(void * cb_ctx);
 
 // Scheduling priorities
 enum ggml_sched_priority {
@@ -655,8 +655,8 @@ extern "C" {
 struct ggml_threadpool * threadpool;
 
 // abort ggml_graph_compute when true
-ggml_abort_callback abort_callback;
-void * abort_callback_data;
+ggml_abort_callback cb_abort;
+void * cb_abort_ctx;
 };
 
 // scratch buffer
@@ -2143,8 +2143,8 @@ extern "C" {
 GGML_LINESEARCH_INVALID_PARAMETERS,
 };
 
-typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
-typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
+typedef void (*ggml_opt_callback)(void * cb_ctx, int accum_step, float * sched, bool * cancel);
+typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * cb_ctx);
 
 // optimization parameters
 //
@@ -2281,8 +2281,8 @@ extern "C" {
 struct ggml_tensor * f,
 struct ggml_cgraph * gf,
 struct ggml_cgraph * gb,
-ggml_opt_callback callback,
-void * callback_data);
+ggml_opt_callback cb_opt,
+void * cb_opt_ctx);
 
 //
 // tensor flags
@@ -728,8 +728,8 @@ struct ggml_backend_cpu_context {
 void * work_data;
 size_t work_size;
 
-ggml_abort_callback abort_callback;
-void * abort_callback_data;
+ggml_abort_callback cb_abort;
+void * cb_abort_ctx;
 };
 
 GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
@@ -772,8 +772,8 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
 }
 }
 
-cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
-cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+cpu_plan->cplan.cb_abort = cpu_ctx->cb_abort;
+cpu_plan->cplan.cb_abort_ctx = cpu_ctx->cb_abort_ctx;
 
 return cpu_plan;
 }
@@ -811,8 +811,8 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t
 }
 cplan.work_data = cpu_ctx->work_data;
 
-cplan.abort_callback = cpu_ctx->abort_callback;
-cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+cplan.cb_abort = cpu_ctx->cb_abort;
+cplan.cb_abort_ctx = cpu_ctx->cb_abort_ctx;
 
 return ggml_graph_compute(cgraph, &cplan);
 }
@@ -878,12 +878,12 @@ ggml_backend_t ggml_backend_cpu_init(void) {
 return NULL;
 }
 
 ctx->n_threads = GGML_DEFAULT_N_THREADS;
 ctx->threadpool = NULL;
 ctx->work_data = NULL;
 ctx->work_size = 0;
-ctx->abort_callback = NULL;
-ctx->abort_callback_data = NULL;
+ctx->cb_abort = NULL;
+ctx->cb_abort_ctx = NULL;
 
 ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
 if (cpu_backend == NULL) {
@@ -922,12 +922,12 @@ void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool
 ctx->threadpool = threadpool;
 }
 
-void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
+void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback cb, void * cb_ctx) {
 GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
 
 struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
-ctx->abort_callback = abort_callback;
-ctx->abort_callback_data = abort_callback_data;
+ctx->cb_abort = cb;
+ctx->cb_abort_ctx = cb_ctx;
 }
 
 GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
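Aside (sketch, not from the commit): with `ggml_abort_callback` now taking a single `cb_ctx` pointer, a deadline-based abort for the CPU backend can be registered through the setter shown above; the `deadline_ctx` type is invented for the example.

```cpp
// Sketch: abort CPU graph computation once a caller-supplied deadline passes.
#include <chrono>
#include "ggml-backend.h"

struct deadline_ctx {
    std::chrono::steady_clock::time_point t_end; // compute must finish before this
};

// matches: typedef bool (*ggml_abort_callback)(void * cb_ctx);
static bool abort_after_deadline(void * cb_ctx) {
    const auto * d = (const deadline_ctx *) cb_ctx;
    return std::chrono::steady_clock::now() > d->t_end; // true => abort
}

// usage, assuming an initialized CPU backend `backend`:
//   deadline_ctx d = { std::chrono::steady_clock::now() + std::chrono::seconds(5) };
//   ggml_backend_cpu_set_abort_callback(backend, abort_after_deadline, &d);
```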
@@ -1093,8 +1093,8 @@ struct ggml_backend_sched {
 
 struct ggml_context * ctx;
 
-ggml_backend_sched_eval_callback callback_eval;
-void * callback_eval_user_data;
+ggml_backend_sched_eval_callback cb_eval;
+void * cb_eval_ctx;
 
 char * context_buffer;
 size_t context_buffer_size;
@@ -1814,7 +1814,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
 }
 }
 
-if (!sched->callback_eval) {
+if (!sched->cb_eval) {
 enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
 if (ec != GGML_STATUS_SUCCESS) {
 return ec;
@@ -1825,14 +1825,14 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
 struct ggml_tensor * t = split->graph.nodes[j0];
 
 // check if the user needs data from this node
-bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);
+bool need = sched->cb_eval(t, true, sched->cb_eval_ctx);
 
 int j1 = j0;
 
 // determine the range [j0, j1] of nodes that can be computed together
 while (!need && j1 < split->graph.n_nodes - 1) {
 t = split->graph.nodes[++j1];
-need = sched->callback_eval(t, true, sched->callback_eval_user_data);
+need = sched->cb_eval(t, true, sched->cb_eval_ctx);
 }
 
 struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
@@ -1845,7 +1845,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
 // TODO: pass backend to the callback, then the user can decide if they want to synchronize
 ggml_backend_synchronize(split_backend);
 
-if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
+if (need && !sched->cb_eval(t, false, sched->cb_eval_ctx)) {
 break;
 }
 
@@ -2012,9 +2012,9 @@ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
 }
 }
 
-void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
-sched->callback_eval = callback;
-sched->callback_eval_user_data = user_data;
+void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback cb, void * cb_ctx) {
+sched->cb_eval = cb;
+sched->cb_eval_ctx = cb_ctx;
 }
 
 int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
@@ -2229,7 +2229,7 @@ void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
 ggml_free(copy.ctx_unallocated);
 }
 
-bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
+bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback cb_eval, void * cb_eval_ctx) {
 struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
 if (copy.buffer == NULL) {
 return false;
@@ -2258,7 +2258,7 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
 }
 
 // compare results, calculate rms etc
-if (!callback(i, t1, t2, user_data)) {
+if (!cb_eval(i, t1, t2, cb_eval_ctx)) {
 break;
 }
 }
@@ -236,8 +236,8 @@ struct ggml_backend_metal_context {
 bool should_capture_next_compute;
 
 // abort ggml_metal_graph_compute if callback returns true
-ggml_abort_callback abort_callback;
-void * abort_callback_data;
+ggml_abort_callback cb_abort;
+void * cb_abort_ctx;
 };
 
 // MSL code
@@ -251,32 +251,32 @@ struct ggml_backend_metal_context {
 @implementation GGMLMetalClass
 @end
 
-static void ggml_metal_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
+static void ggml_metal_default_log_callback(enum ggml_log_level level, const char * msg, void * cb_ctx) {
 fprintf(stderr, "%s", msg);
 
 UNUSED(level);
-UNUSED(user_data);
+UNUSED(cb_ctx);
 }
 
-ggml_log_callback ggml_metal_log_callback = ggml_metal_default_log_callback;
-void * ggml_metal_log_user_data = NULL;
+static ggml_log_callback ggml_metal_log_cb = ggml_metal_default_log_callback;
+static void * ggml_metal_log_cb_ctx = NULL;
 
 GGML_ATTRIBUTE_FORMAT(2, 3)
 static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
-if (ggml_metal_log_callback != NULL) {
+if (ggml_metal_log_cb != NULL) {
 va_list args;
 va_start(args, format);
 char buffer[128];
 int len = vsnprintf(buffer, 128, format, args);
 if (len < 128) {
-ggml_metal_log_callback(level, buffer, ggml_metal_log_user_data);
+ggml_metal_log_cb(level, buffer, ggml_metal_log_cb_ctx);
 } else {
 char* buffer2 = malloc(len+1);
 va_end(args);
 va_start(args, format);
 vsnprintf(buffer2, len+1, format, args);
 buffer2[len] = 0;
-ggml_metal_log_callback(level, buffer2, ggml_metal_log_user_data);
+ggml_metal_log_cb(level, buffer2, ggml_metal_log_cb_ctx);
 free(buffer2);
 }
 va_end(args);
@@ -910,7 +910,7 @@ static enum ggml_status ggml_metal_graph_compute(
 
 // always enqueue the first two command buffers
 // enqueue all of the command buffers if we don't need to abort
-if (cb_idx < 2 || ctx->abort_callback == NULL) {
+if (cb_idx < 2 || ctx->cb_abort == NULL) {
 [command_buffer enqueue];
 }
 }
@@ -3026,7 +3026,7 @@ static enum ggml_status ggml_metal_graph_compute(
 
 [encoder endEncoding];
 
-if (cb_idx < 2 || ctx->abort_callback == NULL) {
+if (cb_idx < 2 || ctx->cb_abort == NULL) {
 [command_buffer commit];
 }
 });
@@ -3058,7 +3058,7 @@ static enum ggml_status ggml_metal_graph_compute(
 continue;
 }
 
-if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) {
+if (ctx->cb_abort && ctx->cb_abort(ctx->cb_abort_ctx)) {
 GGML_METAL_LOG_INFO("%s: command buffer %d aborted", __func__, i);
 return GGML_STATUS_ABORTED;
 }
@@ -3225,19 +3225,15 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buff
 ctx->n_buffers = 1;
 
 if (ctx->all_data != NULL) {
 ctx->buffers[0].data = ctx->all_data;
 ctx->buffers[0].size = size;
-ctx->buffers[0].metal = nil;
-
-if (size_aligned > 0) {
 ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
 length:size_aligned
 options:MTLResourceStorageModeShared
 deallocator:nil];
-}
 }
 
-if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) {
+if (ctx->all_data == NULL || ctx->buffers[0].metal == nil) {
 GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
 free(ctx);
 ggml_backend_metal_free_device();
@@ -3314,17 +3310,14 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data,
 
 // the buffer fits into the max buffer size allowed by the device
 if (size_aligned <= device.maxBufferLength) {
 ctx->buffers[ctx->n_buffers].data = data;
 ctx->buffers[ctx->n_buffers].size = size;
-ctx->buffers[ctx->n_buffers].metal = nil;
 
-if (size_aligned > 0) {
 ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
 
 if (ctx->buffers[ctx->n_buffers].metal == nil) {
 GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
 return false;
-}
 }
 
 ggml_backend_metal_log_allocated_size(device, size_aligned);
@@ -3340,17 +3333,14 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data,
 for (size_t i = 0; i < size; i += size_step) {
 const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
 
 ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
 ctx->buffers[ctx->n_buffers].size = size_step_aligned;
-ctx->buffers[ctx->n_buffers].metal = nil;
 
-if (size_step_aligned > 0) {
 ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
 
 if (ctx->buffers[ctx->n_buffers].metal == nil) {
 GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
 return false;
-}
 }
 
 ggml_backend_metal_log_allocated_size(device, size_step_aligned);
@@ -3427,9 +3417,9 @@ static struct ggml_backend_i ggml_backend_metal_i = {
 /* .event_synchronize = */ NULL,
 };
 
-void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
-ggml_metal_log_callback = log_callback;
-ggml_metal_log_user_data = user_data;
+void ggml_backend_metal_log_set_callback(ggml_log_callback cb, void * cb_ctx) {
+ggml_metal_log_cb = cb;
+ggml_metal_log_cb_ctx = cb_ctx;
 }
 
 static ggml_guid_t ggml_backend_metal_guid(void) {
@@ -3467,13 +3457,13 @@ void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
 ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
 }
 
-void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data) {
+void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback cb, void * cb_ctx) {
 GGML_ASSERT(ggml_backend_is_metal(backend));
 
 struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
 
-ctx->abort_callback = abort_callback;
-ctx->abort_callback_data = user_data;
+ctx->cb_abort = cb;
+ctx->cb_abort_ctx = cb_ctx;
 }
 
 bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
@@ -3491,11 +3481,11 @@ void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) {
 ctx->should_capture_next_compute = true;
 }
 
-GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); // silence warning
+GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * cb_ctx); // silence warning
 
-GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data) {
+GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * cb_ctx) {
 return ggml_backend_metal_init();
 
 GGML_UNUSED(params);
-GGML_UNUSED(user_data);
+GGML_UNUSED(cb_ctx);
 }
@@ -184,7 +184,7 @@ struct backtrace_state {
 void ** end;
 };
 
-static _Unwind_Reason_Code unwind_callback(struct _Unwind_Context* context, void* arg) {
+static _Unwind_Reason_Code unwind_callback(struct _Unwind_Context * context, void * arg) {
 struct backtrace_state * state = (struct backtrace_state *)arg;
 uintptr_t pc = _Unwind_GetIP(context);
 if (pc) {
@@ -19951,7 +19951,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
 ggml_compute_forward(&params, node);
 
-if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
+if (state->ith == 0 && cplan->cb_abort && cplan->cb_abort(cplan->cb_abort_ctx)) {
 state->threadpool->ec = GGML_STATUS_ABORTED;
 }
 
@@ -21011,8 +21011,8 @@ static enum ggml_opt_result ggml_opt_adam(
 struct ggml_tensor * f,
 struct ggml_cgraph * gf,
 struct ggml_cgraph * gb,
-ggml_opt_callback callback,
-void * callback_data) {
+ggml_opt_callback cb_opt,
+void * cb_opt_ctx) {
 GGML_ASSERT(ggml_is_scalar(f));
 GGML_ASSERT(f->type == GGML_TYPE_F32);
 
@@ -21066,8 +21066,8 @@ static enum ggml_opt_result ggml_opt_adam(
 float fx = 0;
 ggml_set_zero(opt->adam.g);
 for (int accum_step = 0; accum_step < n_accum; ++accum_step) {
-if (callback) {
-callback(callback_data, accum_step, &sched, &cancel);
+if (cb_opt) {
+cb_opt(cb_opt_ctx, accum_step, &sched, &cancel);
 if (cancel) {
 return GGML_OPT_RESULT_CANCEL;
 }
@@ -21157,8 +21157,8 @@ static enum ggml_opt_result ggml_opt_adam(
 fx = 0;
 ggml_set_zero(opt->adam.g);
 for (int accum_step = 0; accum_step < n_accum; ++accum_step) {
-if (callback) {
-callback(callback_data, accum_step, &sched, &cancel);
+if (cb_opt) {
+cb_opt(cb_opt_ctx, accum_step, &sched, &cancel);
 if (cancel) {
 return GGML_OPT_RESULT_CANCEL;;
 }
@@ -21254,8 +21254,8 @@ static enum ggml_opt_result linesearch_backtracking(
 const int np,
 struct ggml_tensor * ps[],
 bool * cancel,
-ggml_opt_callback callback,
-void * callback_data) {
+ggml_opt_callback cb_opt,
+void * cb_opt_ctx) {
 int count = 0;
 
 float width = 0.0f;
@@ -21297,10 +21297,10 @@ static enum ggml_opt_result linesearch_backtracking(
 *fx = 0;
 memset(g, 0, sizeof(float)*nx);
 for (int accum_step = 0; accum_step < n_accum; ++accum_step) {
-if (callback) {
+if (cb_opt) {
 // LBFG-S does not support learning rate -> ignore learning schedule
 float sched = 0;
-callback(callback_data, accum_step, &sched, cancel);
+cb_opt(cb_opt_ctx, accum_step, &sched, cancel);
 if (*cancel) {
 return GGML_OPT_RESULT_CANCEL;
 }
@@ -21370,8 +21370,8 @@ static enum ggml_opt_result ggml_opt_lbfgs(
 struct ggml_tensor * f,
 struct ggml_cgraph * gf,
 struct ggml_cgraph * gb,
-ggml_opt_callback callback,
-void * callback_data) {
+ggml_opt_callback cb_opt,
+void * cb_opt_ctx) {
 if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE ||
 params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {
 if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) {
@@ -21440,10 +21440,10 @@ static enum ggml_opt_result ggml_opt_lbfgs(
 fx = 0;
 memset(g, 0, sizeof(float)*nx);
 for (int accum_step = 0; accum_step < n_accum; ++accum_step) {
-if (callback) {
+if (cb_opt) {
 // LBFG-S does not support learning rate -> ignore learning schedule
 float sched = 0;
-callback(callback_data, accum_step, &sched, &cancel);
+cb_opt(cb_opt_ctx, accum_step, &sched, &cancel);
 if (cancel) {
 return GGML_OPT_RESULT_CANCEL;
 }
@@ -21516,7 +21516,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
 // to determine if the optimization should be cancelled
 // this is a simple change, but not doing this atm, since I don't have a nice
 // way to test and don't want to break something with so many changes lined up
-ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data);
+ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, cb_opt, cb_opt_ctx);
 if (cancel) {
 return GGML_OPT_RESULT_CANCEL;
 }
@@ -21834,8 +21834,8 @@ enum ggml_opt_result ggml_opt_resume_g(
 struct ggml_tensor * f,
 struct ggml_cgraph * gf,
 struct ggml_cgraph * gb,
-ggml_opt_callback callback,
-void * callback_data) {
+ggml_opt_callback cb_opt,
+void * cb_opt_ctx) {
 
 GGML_ASSERT(f->grad && "ggml_set_param must be called for at least one ancestor");
 
@@ -21845,11 +21845,11 @@ enum ggml_opt_result ggml_opt_resume_g(
 switch (opt->params.type) {
 case GGML_OPT_TYPE_ADAM:
 {
-result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
+result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, cb_opt, cb_opt_ctx);
 } break;
 case GGML_OPT_TYPE_LBFGS:
 {
-result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
+result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, cb_opt, cb_opt_ctx);
 } break;
 }
 
@@ -221,7 +221,7 @@ extern "C" {
 bool sorted;
 } llama_token_data_array;
 
-typedef bool (*llama_progress_callback)(float progress, void * user_data);
+typedef bool (*llama_progress_callback)(float progress, void * cb_ctx);
 
 // Input data for llama_decode
 // A llama_batch object can contain input about one or many sequences
@@ -290,12 +290,10 @@ extern "C" {
 const char * rpc_servers;
 
 // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
-// If the provided progress_callback returns true, model loading continues.
+// If the provided cb_progress returns true, model loading continues.
 // If it returns false, model loading is immediately aborted.
-llama_progress_callback progress_callback;
-
-// context pointer passed to the progress callback
-void * progress_callback_user_data;
+llama_progress_callback cb_progress;
+void * cb_progress_ctx;
 
 // override key-value pairs of the model meta data
 const struct llama_model_kv_override * kv_overrides;
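Aside (sketch, not from the commit): after this hunk, a model-loading progress callback is attached through the `cb_progress` / `cb_progress_ctx` pair of `llama_model_params`. The model path and the `print_progress` helper below are placeholders.

```cpp
// Sketch: report model-loading progress via the renamed cb_progress fields.
#include <cstdio>
#include "llama.h"

// matches: typedef bool (*llama_progress_callback)(float progress, void * cb_ctx);
static bool print_progress(float progress, void * cb_ctx) {
    const char * tag = (const char *) cb_ctx; // arbitrary user data
    fprintf(stderr, "\r%s: %3d%%", tag, (int) (progress * 100));
    return true; // returning false aborts model loading
}

int main() {
    llama_model_params mparams = llama_model_default_params();
    mparams.cb_progress     = print_progress;
    mparams.cb_progress_ctx = (void *) "loading";

    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model != nullptr) {
        llama_free_model(model);
    }
    return 0;
}
```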
@@ -331,25 +329,24 @@ extern "C" {
 uint32_t yarn_orig_ctx; // YaRN original context size
 float defrag_thold; // defragment the KV cache if holes/size > thold, < 0 disabled (default)
 
-ggml_backend_sched_eval_callback cb_eval;
-void * cb_eval_user_data;
-
 enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
 enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]
 
-// Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
-// TODO: move at the end of the struct
-bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
-bool embeddings; // if true, extract embeddings (together with logits)
-bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
-bool no_perf; // whether to measure performance timings
+ggml_backend_sched_eval_callback cb_eval;
+void * cb_eval_ctx;
 
 // Abort callback
 // if it returns true, execution of llama_decode() will be aborted
 // currently works only with CPU execution
-ggml_abort_callback abort_callback;
-void * abort_callback_data;
+ggml_abort_callback cb_abort;
+void * cb_abort_ctx;
+
+// Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
+bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
+bool embeddings; // if true, extract embeddings (together with logits)
+bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU
+bool flash_attn; // enable flash attention [EXPERIMENTAL]
+bool no_perf; // disable performance timings
 };
 
 // model quantization parameters
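Aside (sketch, not from the commit): the regrouped `llama_context_params` fields are filled the same way as before, only under the new names. The `should_abort` helper and global flag are invented for the example; `cb_eval` / `cb_eval_ctx` would be assigned analogously.

```cpp
// Sketch: wiring the renamed callback fields of llama_context_params.
#include <atomic>
#include "llama.h"

static std::atomic<bool> g_stop{false};

// matches ggml_abort_callback: return true to abort llama_decode() (CPU execution only)
static bool should_abort(void * cb_ctx) {
    (void) cb_ctx; // state could also be passed here instead of a global
    return g_stop.load();
}

static llama_context * make_context(llama_model * model) {
    llama_context_params cparams = llama_context_default_params();
    // cparams.cb_eval     = ...;  // ggml_backend_sched_eval_callback
    // cparams.cb_eval_ctx = ...;
    cparams.cb_abort     = should_abort;
    cparams.cb_abort_ctx = nullptr;
    return llama_new_context_with_model(model, cparams);
}
```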
@@ -373,7 +370,7 @@ extern "C" {
 } llama_logit_bias;
 
 typedef struct llama_sampler_chain_params {
-bool no_perf; // whether to measure performance timings
+bool no_perf; // disable performance timings
 } llama_sampler_chain_params;
 
 // used in chat template
@@ -833,7 +830,7 @@ extern "C" {
 LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
 
 // Set abort callback
-LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
+LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback cb, void * cb_ctx);
 
 // Wait until all computations are finished
 // This is automatically done when using one of the functions below to obtain the computation results
@@ -1168,7 +1165,7 @@ extern "C" {
 
 // Set callback for all future logging events.
 // If this is not called, or NULL is supplied, everything is output on stderr.
-LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
+LLAMA_API void llama_log_set(ggml_log_callback cb, void * cb_ctx);
 
 //
 // Performance utils
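Aside (sketch, not from the commit): with `llama_log_set(ggml_log_callback cb, void * cb_ctx)`, the context pointer can carry whatever the log sink needs, for example a `FILE *`; the log file name below is a placeholder.

```cpp
// Sketch: route llama.cpp logging to a file via the renamed cb / cb_ctx pair.
#include <cstdio>
#include "llama.h"

// matches: typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * cb_ctx);
static void log_to_file(enum ggml_log_level level, const char * text, void * cb_ctx) {
    FILE * f = (FILE *) cb_ctx;
    fprintf(f, "[%d] %s", (int) level, text); // `text` already contains its newline(s)
    fflush(f);
}

int main() {
    FILE * f = fopen("llama.log", "w"); // placeholder output path
    if (f != nullptr) {
        llama_log_set(log_to_file, f);
    }
    // ... load models / run inference; library logs now go to llama.log ...
    return 0;
}
```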
src/llama.cpp
@ -2183,17 +2183,17 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
|
|||||||
struct llama_state {
|
struct llama_state {
|
||||||
llama_state() {
|
llama_state() {
|
||||||
#ifdef GGML_USE_METAL
|
#ifdef GGML_USE_METAL
|
||||||
ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
|
ggml_backend_metal_log_set_callback(cb_log, cb_log_ctx);
|
||||||
#elif defined(GGML_USE_CUDA)
|
#elif defined(GGML_USE_CUDA)
|
||||||
ggml_backend_cuda_log_set_callback(log_callback, log_callback_user_data);
|
ggml_backend_cuda_log_set_callback(cb_log, cb_log_ctx);
|
||||||
#elif defined(GGML_USE_CANN)
|
#elif defined(GGML_USE_CANN)
|
||||||
ggml_backend_cann_log_set_callback(log_callback, log_callback_user_data);
|
ggml_backend_cann_log_set_callback(cb_log, cb_log_ctx);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// We save the log callback globally
|
// We save the log callback globally
|
||||||
ggml_log_callback log_callback = llama_log_callback_default;
|
ggml_log_callback cb_log = llama_log_callback_default;
|
||||||
void * log_callback_user_data = nullptr;
|
void * cb_log_ctx = nullptr;
|
||||||
};
|
};
|
||||||
|
|
||||||
static llama_state g_state;
|
static llama_state g_state;
|
||||||
@ -2491,7 +2491,7 @@ struct llama_cparams {
|
|||||||
enum llama_pooling_type pooling_type;
|
enum llama_pooling_type pooling_type;
|
||||||
|
|
||||||
ggml_backend_sched_eval_callback cb_eval;
|
ggml_backend_sched_eval_callback cb_eval;
|
||||||
void * cb_eval_user_data;
|
void * cb_eval_ctx;
|
||||||
};
|
};
|
||||||
|
|
||||||
// TODO: separate into "llama_layer_enc" and "llama_layer_dec"
|
// TODO: separate into "llama_layer_enc" and "llama_layer_dec"
|
||||||
@@ -3263,8 +3263,8 @@ struct llama_context {
     std::vector<uint8_t> buf_compute_meta;
     ggml_backend_sched_t sched = nullptr;

-    ggml_abort_callback abort_callback = nullptr;
-    void * abort_callback_data = nullptr;
+    ggml_abort_callback cb_abort = nullptr;
+    void * cb_abort_ctx = nullptr;

     // input tensors
     struct ggml_tensor * inp_tokens; // I32 [n_batch]

@@ -4901,13 +4901,13 @@ struct llama_model_loader {
     size_t size_data = 0;
     std::vector<std::pair<size_t, size_t>> mmaps_used;

-    // Returns false if cancelled by progress_callback
+    // Returns false if cancelled by cb_progress
     bool load_all_data(
             struct ggml_context * ctx,
             llama_buf_map & bufs_mmap,
             llama_mlocks * lmlocks,
-            llama_progress_callback progress_callback,
-            void * progress_callback_user_data) {
+            llama_progress_callback cb_progress,
+            void * cb_progress_ctx) {
         GGML_ASSERT(size_data != 0 && "call init_mappings() first");

         std::vector<no_init<uint8_t>> read_buf;
@@ -4958,8 +4958,8 @@ struct llama_model_loader {
                 continue;
             }

-            if (progress_callback) {
-                if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
+            if (cb_progress) {
+                if (!cb_progress((float) size_done / size_data, cb_progress_ctx)) {
                     return false;
                 }
             }

@@ -5081,10 +5081,10 @@ struct llama_model_loader {
                 }
             }
         }
-        if (progress_callback) {
+        if (cb_progress) {
             // Even though the model is done loading, we still honor
             // cancellation since we need to free allocations.
-            return progress_callback(1.0f, progress_callback_user_data);
+            return cb_progress(1.0f, cb_progress_ctx);
         }
     }

@@ -6651,7 +6651,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     }
 }

-// Returns false if cancelled by progress_callback
+// Returns false if cancelled by cb_progress
 static bool llm_load_tensors(
         llama_model_loader & ml,
         llama_model & model,

@@ -6660,8 +6660,8 @@ static bool llm_load_tensors(
         int main_gpu,
         const float * tensor_split,
         bool use_mlock,
-        llama_progress_callback progress_callback,
-        void * progress_callback_user_data) {
+        llama_progress_callback cb_progress,
+        void * cb_progress_ctx) {
     auto & hparams = model.hparams;

     model.split_mode = split_mode;
@@ -8581,7 +8581,7 @@ static bool llm_load_tensors(
     for (auto & it : ctx_bufs) {
         ggml_context * ctx = it.first;
         auto & bufs = it.second;
-        if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
+        if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, cb_progress, cb_progress_ctx)) {
             return false;
         }
     }

@@ -8595,7 +8595,7 @@ static bool llm_load_tensors(
     return true;
 }

-// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
+// Returns 0 on success, -1 on error, and -2 on cancellation via llama_cb_progress
 static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
     model.t_start_us = ggml_time_us();

@@ -8651,7 +8651,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam

     if (!llm_load_tensors(
             ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
-            params.progress_callback, params.progress_callback_user_data
+            params.cb_progress, params.cb_progress_ctx
         )) {
         return -2;
     }
@@ -16046,9 +16046,9 @@ static void llama_graph_compute(
 #endif

     if (lctx.backend_cpu != nullptr) {
-        ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
-        ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
-        ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
+        ggml_backend_cpu_set_n_threads     (lctx.backend_cpu, n_threads);
+        ggml_backend_cpu_set_threadpool    (lctx.backend_cpu, threadpool);
+        ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.cb_abort, lctx.cb_abort_ctx);
     }
 #ifdef GGML_USE_BLAS
     if (lctx.backend_blas != nullptr) {

@@ -16208,7 +16208,7 @@ static int llama_decode_internal(
     //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);

     ggml_backend_sched_reset(lctx.sched);
-    ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
+    ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_ctx);

     ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false);

@@ -16432,7 +16432,7 @@ static int llama_encode_internal(
     GGML_ASSERT(n_threads > 0);

     ggml_backend_sched_reset(lctx.sched);
-    ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
+    ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_ctx);

     ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false);

@@ -17907,8 +17907,8 @@ struct llama_model_params llama_model_default_params() {
         /*.main_gpu =*/ 0,
         /*.tensor_split =*/ nullptr,
         /*.rpc_servers =*/ nullptr,
-        /*.progress_callback =*/ nullptr,
-        /*.progress_callback_user_data =*/ nullptr,
+        /*.cb_progress =*/ nullptr,
+        /*.cb_progress_ctx =*/ nullptr,
         /*.kv_overrides =*/ nullptr,
         /*.vocab_only =*/ false,
         /*.use_mmap =*/ true,
@@ -17943,17 +17943,17 @@ struct llama_context_params llama_context_default_params() {
         /*.yarn_beta_slow =*/ 1.0f,
         /*.yarn_orig_ctx =*/ 0,
         /*.defrag_thold =*/ -1.0f,
-        /*.cb_eval =*/ nullptr,
-        /*.cb_eval_user_data =*/ nullptr,
         /*.type_k =*/ GGML_TYPE_F16,
         /*.type_v =*/ GGML_TYPE_F16,
+        /*.cb_eval =*/ nullptr,
+        /*.cb_eval_ctx =*/ nullptr,
+        /*.cb_abort =*/ nullptr,
+        /*.cb_abort_ctx =*/ nullptr,
         /*.logits_all =*/ false,
         /*.embeddings =*/ false,
         /*.offload_kqv =*/ true,
         /*.flash_attn =*/ false,
         /*.no_perf =*/ true,
-        /*.abort_callback =*/ nullptr,
-        /*.abort_callback_data =*/ nullptr,
     };

     return result;
@@ -18067,9 +18067,9 @@ struct llama_model * llama_load_model_from_file(
     llama_model * model = new llama_model;

     unsigned cur_percentage = 0;
-    if (params.progress_callback == NULL) {
-        params.progress_callback_user_data = &cur_percentage;
-        params.progress_callback = [](float progress, void * ctx) {
+    if (params.cb_progress == NULL) {
+        params.cb_progress_ctx = &cur_percentage;
+        params.cb_progress = [](float progress, void * ctx) {
             unsigned * cur_percentage_p = (unsigned *) ctx;
             unsigned percentage = (unsigned) (100 * progress);
             while (percentage > *cur_percentage_p) {
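When the caller supplies its own cb_progress, the default percentage printer above is not installed. A minimal caller-side sketch, assuming an illustrative watcher struct and model path; returning false from the callback cancels loading, which llama_model_load surfaces as the -2 result noted earlier in this diff.

// Hypothetical sketch: report loading progress and allow cancellation.
#include "llama.h"
#include <cstdio>

struct load_watch {
    float last_reported = 0.0f;
    bool  cancel        = false;   // could be flipped from another thread or a UI button
};

static bool my_cb_progress(float progress, void * cb_progress_ctx) {
    auto * w = (load_watch *) cb_progress_ctx;
    if (progress - w->last_reported >= 0.10f) {
        fprintf(stderr, "loaded %3d%%\n", (int) (progress * 100));
        w->last_reported = progress;
    }
    return !w->cancel;             // false aborts model loading
}

// usage:
//   load_watch watch;
//   llama_model_params mparams = llama_model_default_params();
//   mparams.cb_progress     = my_cb_progress;
//   mparams.cb_progress_ctx = &watch;
//   llama_model * model = llama_load_model_from_file("model.gguf", mparams);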
@@ -18189,8 +18189,8 @@ struct llama_context * llama_new_context_with_model(
             hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn :
                                            hparams.n_ctx_train;

     cparams.cb_eval = params.cb_eval;
-    cparams.cb_eval_user_data = params.cb_eval_user_data;
+    cparams.cb_eval_ctx = params.cb_eval_ctx;

     auto rope_scaling_type = params.rope_scaling_type;
     if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) {

@@ -18228,8 +18228,8 @@ struct llama_context * llama_new_context_with_model(
     LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
     LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);

-    ctx->abort_callback = params.abort_callback;
-    ctx->abort_callback_data = params.abort_callback_data;
+    ctx->cb_abort = params.cb_abort;
+    ctx->cb_abort_ctx = params.cb_abort_ctx;

     ctx->logits_all = params.logits_all;

@@ -19971,9 +19971,9 @@ int32_t llama_n_threads_batch(struct llama_context * ctx) {
     return ctx->cparams.n_threads_batch;
 }

-void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
-    ctx->abort_callback = abort_callback;
-    ctx->abort_callback_data = abort_callback_data;
+void llama_set_abort_callback(struct llama_context * ctx, bool (*cb)(void * data), void * cb_ctx) {
+    ctx->cb_abort = cb;
+    ctx->cb_abort_ctx = cb_ctx;
 }

 void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
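As the llama_graph_compute hunk above shows, the stored cb_abort / cb_abort_ctx pair is handed to the CPU backend before each graph compute. A minimal sketch of installing one at runtime, assuming an illustrative deadline context; returning true from the callback is expected to abort the in-flight evaluation so subsequent decode calls stop early.

// Hypothetical sketch: abort decoding once a wall-clock deadline has passed.
#include "llama.h"
#include <chrono>

struct abort_deadline {
    std::chrono::steady_clock::time_point until;
};

static bool my_cb_abort(void * cb_ctx) {
    auto * d = (abort_deadline *) cb_ctx;
    return std::chrono::steady_clock::now() > d->until;   // true => abort compute
}

// usage (ctx is an existing llama_context *):
//   abort_deadline deadline { std::chrono::steady_clock::now() + std::chrono::seconds(30) };
//   llama_set_abort_callback(ctx, my_cb_abort, &deadline);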
@@ -20761,15 +20761,15 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
     return ctx->model.tensors_by_name;
 }

-void llama_log_set(ggml_log_callback log_callback, void * user_data) {
-    g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
-    g_state.log_callback_user_data = user_data;
+void llama_log_set(ggml_log_callback cb, void * cb_ctx) {
+    g_state.cb_log = cb ? cb : llama_log_callback_default;
+    g_state.cb_log_ctx = cb_ctx;
 #ifdef GGML_USE_METAL
-    ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
+    ggml_backend_metal_log_set_callback(g_state.cb_log, g_state.cb_log_ctx);
 #elif defined(GGML_USE_CUDA)
-    ggml_backend_cuda_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
+    ggml_backend_cuda_log_set_callback(g_state.cb_log, g_state.cb_log_ctx);
 #elif defined(GGML_USE_CANN)
-    ggml_backend_cann_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
+    ggml_backend_cann_log_set_callback(g_state.cb_log, g_state.cb_log_ctx);
 #endif
 }

@@ -20779,12 +20779,12 @@ static void llama_log_internal_v(ggml_log_level level, const char * format, va_l
     char buffer[128];
     int len = vsnprintf(buffer, 128, format, args);
     if (len < 128) {
-        g_state.log_callback(level, buffer, g_state.log_callback_user_data);
+        g_state.cb_log(level, buffer, g_state.cb_log_ctx);
     } else {
         char * buffer2 = new char[len + 1];
         vsnprintf(buffer2, len + 1, format, args_copy);
         buffer2[len] = 0;
-        g_state.log_callback(level, buffer2, g_state.log_callback_user_data);
+        g_state.cb_log(level, buffer2, g_state.cb_log_ctx);
         delete[] buffer2;
     }
     va_end(args_copy);

@@ -20797,9 +20797,9 @@ void llama_log_internal(ggml_log_level level, const char * format, ...) {
     va_end(args);
 }

-void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
+void llama_log_callback_default(ggml_log_level level, const char * text, void * cb_ctx) {
     (void) level;
-    (void) user_data;
+    (void) cb_ctx;
     fputs(text, stderr);
     fflush(stderr);
 }

@@ -17,7 +17,7 @@ int main(int argc, char *argv[] ) {
     llama_backend_init();
     auto params = llama_model_params{};
     params.use_mmap = false;
-    params.progress_callback = [](float progress, void * ctx){
+    params.cb_progress = [](float progress, void * ctx){
         (void) ctx;
         return progress > 0.50;
     };