Mirror of https://github.com/ggerganov/llama.cpp.git, synced 2024-12-25 02:44:36 +00:00
llama : llama_perf + option to disable timings during decode (#9355)
* llama : llama_perf + option to disable timings during decode

ggml-ci

* common : add llama_arg

* Update src/llama.cpp

Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>

* perf : separate functions in the API

ggml-ci

* perf : safer pointer handling + naming update

ggml-ci

* minor : better local var name

* perf : abort on invalid sampler pointer

ggml-ci

---------

Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>
This commit is contained in:
parent bd35cb0ae3
commit 0abc6a2c25
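Before the diff, a minimal usage sketch of the renamed perf API. This is not part of the commit; it assumes a llama_context `ctx` created with llama_new_context_with_model() and a sampler chain `smpl` created with llama_sampler_chain_init(), and the helper name report_perf is hypothetical.

#include <cstdio>
#include "llama.h"

// Hypothetical helper: report and reset timings for an existing context and
// sampler chain, using the functions introduced/renamed by this commit.
static void report_perf(llama_context * ctx, llama_sampler * smpl) {
    // raw counters via the new data structs
    llama_perf_context_data cdata = llama_perf_context(ctx);
    llama_perf_sampler_data sdata = llama_perf_sampler(smpl); // smpl must be a sampler chain

    fprintf(stderr, "prompt eval: %d tokens in %.2f ms, sampling: %d runs in %.2f ms\n",
            cdata.n_p_eval, cdata.t_p_eval_ms, sdata.n_sample, sdata.t_sample_ms);

    // or the ready-made reports, replacing the old llama_perf_print(ptr, type) calls
    llama_perf_sampler_print(smpl);
    llama_perf_context_print(ctx);

    llama_perf_context_reset(ctx);
    llama_perf_sampler_reset(smpl);
}

Note that the new llama_context_params::no_perf field defaults to true in llama_context_default_params() (library-level timings off); the common-code plumbing in this diff forwards gpt_params::no_perf into it, so the examples keep timings on unless --no-perf is given.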
@@ -720,6 +720,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.prompt = value;
         }
     ));
+    add_opt(llama_arg(
+        {"--no-perf"},
+        format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
+        [](gpt_params & params) {
+            params.no_perf = true;
+            params.sparams.no_perf = true;
+        }
+    ).set_env("LLAMA_ARG_NO_PERF"));
    add_opt(llama_arg(
        {"-f", "--file"}, "FNAME",
        "a file containing the prompt (default: none)",
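For reference, a sketch of what the handler above amounts to in common code; the function name disable_perf_timings is hypothetical, and the effect is the same as passing --no-perf or exporting LLAMA_ARG_NO_PERF.

#include "common.h"

// Hypothetical helper mirroring the --no-perf handler registered above.
static void disable_perf_timings(gpt_params & params) {
    params.no_perf         = true; // forwarded to llama_context_params::no_perf
    params.sparams.no_perf = true; // forwarded to llama_sampler_chain_params::no_perf via gpt_sampler_init()
}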
@@ -820,7 +820,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         }
         llama_kv_cache_clear(lctx);
         llama_synchronize(lctx);
-        llama_perf_reset(lctx, LLAMA_PERF_TYPE_CONTEXT);
+        llama_perf_context_reset(lctx);
     }
 
     iparams.model = model;
@@ -916,6 +916,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.cb_eval_user_data = params.cb_eval_user_data;
     cparams.offload_kqv = !params.no_kv_offload;
     cparams.flash_attn = params.flash_attn;
+    cparams.no_perf = params.no_perf;
 
     cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
     cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
@@ -124,6 +124,7 @@ struct gpt_sampler_params {
     float mirostat_eta = 0.10f; // learning rate
     bool penalize_nl = false; // consider newlines as a repeatable token
     bool ignore_eos = false;
+    bool no_perf = false; // disable performance metrics
 
     std::vector<enum gpt_sampler_type> samplers = {
         GPT_SAMPLER_TYPE_TOP_K,
@@ -246,6 +247,7 @@ struct gpt_params {
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
     bool flash_attn = false; // flash attention
+    bool no_perf = false; // disable performance metrics
 
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool logits_all = false; // return logits for all tokens in the batch
@@ -142,7 +142,7 @@ std::string gpt_sampler_params::print() const {
 struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
     llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
 
-    lparams.no_perf = false; // TODO: control via params
+    lparams.no_perf = params.no_perf;
 
     auto * result = new gpt_sampler {
         /* .params = */ params,
@@ -257,10 +257,10 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
     // TODO: measure grammar performance
 
     if (gsmpl) {
-        llama_perf_print(gsmpl->chain, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
+        llama_perf_sampler_print(gsmpl->chain);
     }
     if (ctx) {
-        llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+        llama_perf_context_print(ctx);
     }
 }
 
@@ -187,7 +187,7 @@ int main(int argc, char ** argv) {
     }
 
     LOG_TEE("\n");
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_context_print(ctx);
 
     llama_batch_free(batch);
 
@@ -200,8 +200,8 @@ let t_main_end = ggml_time_us()
 
 print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n\n")
 
-llama_perf_print(UnsafeRawPointer(context), LLAMA_PERF_TYPE_CONTEXT)
-llama_perf_print(UnsafeRawPointer(smpl), LLAMA_PERF_TYPE_SAMPLER_CHAIN)
+llama_perf_sampler_print(smpl)
+llama_perf_context_print(context)
 
 private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
     let utf8Count = text.utf8.count
@@ -229,8 +229,8 @@ int main(int argc, char ** argv) {
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
 
     LOG_TEE("\n");
-    llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_sampler_print(smpl);
+    llama_perf_context_print(ctx);
 
     fprintf(stderr, "\n");
 
@@ -306,7 +306,7 @@ int main(int argc, char ** argv) {
     }
 
     LOG_TEE("\n");
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_context_print(ctx);
 
     // clean up
     llama_batch_free(batch);
@@ -182,7 +182,7 @@ int main(int argc, char ** argv) {
     }
 
     LOG_TEE("\n");
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_context_print(ctx);
 
     llama_free(ctx);
     llama_free_model(model);
@@ -637,7 +637,7 @@ int main(int argc, char ** argv) {
     g_collector.save_imatrix();
 
     LOG_TEE("\n");
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_context_print(ctx);
 
     llama_free(ctx);
     llama_free_model(model);
@@ -1630,7 +1630,7 @@ int main(int argc, char ** argv) {
         fflush(p_err->fout);
     }
 
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_context_print(ctx);
 
     llama_free(ctx);
 
@@ -308,7 +308,7 @@ int main(int argc, char ** argv) {
         // process the prompt
        process_prompt(ctx_llava, image_embed, &params, params.prompt);
 
-        llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
+        llama_perf_context_print(ctx_llava->ctx_llama);
        llava_image_embed_free(image_embed);
        ctx_llava->model = NULL;
        llava_free(ctx_llava);
@@ -325,7 +325,7 @@ int main(int argc, char ** argv) {
         // process the prompt
        process_prompt(ctx_llava, image_embed, &params, params.prompt);
 
-        llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
+        llama_perf_context_print(ctx_llava->ctx_llama);
        llava_image_embed_free(image_embed);
        ctx_llava->model = NULL;
        llava_free(ctx_llava);
@@ -319,7 +319,7 @@ int main(int argc, char ** argv) {
         }
     }
     printf("\n");
-    llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_context_print(ctx_llava->ctx_llama);
 
     ctx_llava->model = NULL;
     llava_free(ctx_llava);
@@ -240,8 +240,7 @@ int main(int argc, char ** argv){
     LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
 
     LOG_TEE("\ntarget:\n\n");
-    llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    gpt_perf_print(ctx, smpl);
 
     gpt_sampler_free(smpl);
 
@@ -415,7 +415,7 @@ int main(int argc, char ** argv) {
     LOG_TEE("\n");
 
     // TODO: print sampling/grammar timings for all clients
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_context_print(ctx);
 
     llama_batch_free(batch);
 
@@ -256,7 +256,7 @@ int main(int argc, char ** argv) {
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
 
     LOG_TEE("\n");
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_context_print(ctx);
 
     fprintf(stderr, "\n");
 
@@ -2047,7 +2047,7 @@ int main(int argc, char ** argv) {
     }
 
     LOG_TEE("\n");
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_context_print(ctx);
     write_logfile(ctx, params, model, results);
 
     llama_free(ctx);
@@ -292,7 +292,7 @@ int main(int argc, char ** argv) {
     }
 
     LOG_TEE("\n");
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_context_print(ctx);
 
     // clean up
     llama_batch_free(query_batch);
@@ -154,8 +154,8 @@ int main(int argc, char ** argv) {
            __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
 
     LOG_TEE("\n");
-    llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_sampler_print(smpl);
+    llama_perf_context_print(ctx);
 
     fprintf(stderr, "\n");
 
@@ -616,7 +616,7 @@ int main(int argc, char ** argv) {
 
     LOG_TEE("\ndraft:\n\n");
     // TODO: print sampling/grammar timings for all drafts
-    llama_perf_print(ctx_dft, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_context_print(ctx_dft);
 
     LOG_TEE("\ntarget:\n\n");
     gpt_perf_print(ctx_tgt, smpl);
@@ -343,7 +343,7 @@ extern "C" {
         bool embeddings; // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
         bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
-        //bool no_perf; // whether to measure performance timings, TODO: implement
+        bool no_perf; // whether to measure performance timings
 
         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
@@ -1176,13 +1176,30 @@ extern "C" {
     // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
     //
 
-    enum llama_perf_type {
-        LLAMA_PERF_TYPE_CONTEXT = 0,
-        LLAMA_PERF_TYPE_SAMPLER_CHAIN = 1,
+    struct llama_perf_context_data {
+        double t_start_ms;
+        double t_load_ms;
+        double t_p_eval_ms;
+        double t_eval_ms;
+
+        int32_t n_p_eval;
+        int32_t n_eval;
     };
 
-    LLAMA_API void llama_perf_print(const void * ctx, enum llama_perf_type type);
-    LLAMA_API void llama_perf_reset( void * ctx, enum llama_perf_type type);
+    struct llama_perf_sampler_data {
+        double t_sample_ms;
+
+        int32_t n_sample;
+    };
+
+    LLAMA_API struct llama_perf_context_data llama_perf_context (const struct llama_context * ctx);
+    LLAMA_API void llama_perf_context_print(const struct llama_context * ctx);
+    LLAMA_API void llama_perf_context_reset( struct llama_context * ctx);
+
+    // NOTE: the following work only with samplers constructed via llama_sampler_chain_init
+    LLAMA_API struct llama_perf_sampler_data llama_perf_sampler (const struct llama_sampler * chain);
+    LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
+    LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain);
 
     LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
 
@@ -1669,3 +1669,37 @@ uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) {
 
     return LLAMA_DEFAULT_SEED;
 }
+
+// perf
+
+struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * chain) {
+    struct llama_perf_sampler_data data = {};
+
+    if (chain == nullptr || chain->iface != &llama_sampler_chain_i) {
+        GGML_ABORT("%s: invalid sampler passed - requires a sampler created with llama_sampler_chain_init()\n", __func__);
+    }
+
+    const auto * ctx = (const struct llama_sampler_chain *) chain->ctx;
+
+    data.t_sample_ms = 1e-3 * ctx->t_sample_us;
+    data.n_sample = std::max(0, ctx->n_sample);
+
+    return data;
+}
+
+void llama_perf_sampler_print(const struct llama_sampler * chain) {
+    const auto data = llama_perf_sampler(chain);
+
+    LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
+}
+
+void llama_perf_sampler_reset(struct llama_sampler * chain) {
+    if (chain == nullptr || chain->iface != &llama_sampler_chain_i) {
+        GGML_ABORT("%s: invalid sampler passed - requires a sampler created with llama_sampler_chain_init()\n", __func__);
+    }
+
+    auto * ctx = (struct llama_sampler_chain *) chain->ctx;
+
+    ctx->t_sample_us = ctx->n_sample = 0;
+}
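The abort above means the perf queries only accept samplers created with llama_sampler_chain_init(). A small sketch of that constraint follows; it is illustrative only, and assumes the sampler-chain constructors from the same release (llama_sampler_chain_default_params, llama_sampler_chain_add, llama_sampler_init_greedy, llama_sampler_free) are available.

#include "llama.h"

// Sketch: perf queries work on a sampler chain; anything else triggers GGML_ABORT.
static void sampler_perf_example() {
    llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_greedy());

    // ... sample some tokens with the chain ...

    llama_perf_sampler_print(chain);  // OK: created via llama_sampler_chain_init()
    // llama_perf_sampler_print(llama_sampler_init_greedy()); // would abort: not a chain

    llama_sampler_free(chain);
}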
@@ -2486,6 +2486,7 @@ struct llama_cparams {
     bool causal_attn;
     bool offload_kqv;
     bool flash_attn;
+    bool no_perf;
 
     enum llama_pooling_type pooling_type;
 
@@ -6661,8 +6662,6 @@ static bool llm_load_tensors(
     bool use_mlock,
     llama_progress_callback progress_callback,
     void * progress_callback_user_data) {
-    model.t_start_us = ggml_time_us();
-
     auto & hparams = model.hparams;
 
     model.split_mode = split_mode;
@@ -8593,14 +8592,13 @@ static bool llm_load_tensors(
         }
     }
 
-    // loading time will be recalculate after the first eval, so
-    // we take page faults deferred by mmap() into consideration
-    model.t_load_us = ggml_time_us() - model.t_start_us;
     return true;
 }
 
 // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
 static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
+    model.t_start_us = ggml_time_us();
+
     try {
         llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
 
@@ -8662,6 +8660,10 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
         return -1;
     }
 
+    // loading time will be recalculate after the first eval, so
+    // we take page faults deferred by mmap() into consideration
+    model.t_load_us = ggml_time_us() - model.t_start_us;
+
     return 0;
 }
 
@@ -17949,6 +17951,7 @@ struct llama_context_params llama_context_default_params() {
         /*.embeddings =*/ false,
         /*.offload_kqv =*/ true,
         /*.flash_attn =*/ false,
+        /*.no_perf =*/ true,
         /*.abort_callback =*/ nullptr,
         /*.abort_callback_data =*/ nullptr,
     };
@@ -18159,6 +18162,7 @@ struct llama_context * llama_new_context_with_model(
     cparams.embeddings = params.embeddings;
     cparams.offload_kqv = params.offload_kqv;
     cparams.flash_attn = params.flash_attn;
+    cparams.no_perf = params.no_perf;
     cparams.pooling_type = params.pooling_type;
 
     cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
@@ -20077,10 +20081,14 @@ void llama_synchronize(struct llama_context * ctx) {
 
     // add the evaluation to the stats
     if (ctx->n_queued_tokens == 1) {
+        if (!ctx->cparams.no_perf) {
             ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us;
+        }
         ctx->n_eval++;
     } else if (ctx->n_queued_tokens > 1) {
+        if (!ctx->cparams.no_perf) {
             ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us;
+        }
         ctx->n_p_eval += ctx->n_queued_tokens;
     }
 
@@ -20688,65 +20696,40 @@ const char * llama_print_system_info(void) {
     return s.c_str();
 }
 
-void llama_perf_print(const void * ctx, enum llama_perf_type type) {
-    switch (type) {
-        case LLAMA_PERF_TYPE_CONTEXT:
-            {
-                const auto * p = (const struct llama_context *) ctx;
-
-                const double t_start_ms = 1e-3 * p->t_start_us;
-                const double t_end_ms = 1.00 * ggml_time_ms();
-                const double t_load_ms = 1e-3 * p->t_load_us;
-                const double t_p_eval_ms = 1e-3 * p->t_p_eval_us;
-                const double t_eval_ms = 1e-3 * p->t_eval_us;
-
-                const int32_t n_p_eval = std::max(0, p->n_p_eval);
-                const int32_t n_eval = std::max(1, p->n_eval);
-
-                LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, t_load_ms);
+struct llama_perf_context_data llama_perf_context(const struct llama_context * ctx) {
+    struct llama_perf_context_data data = {};
+
+    if (ctx == nullptr) {
+        return data;
+    }
+
+    data.t_start_ms = 1e-3 * ctx->t_start_us;
+    data.t_load_ms = 1e-3 * ctx->t_load_us;
+    data.t_p_eval_ms = 1e-3 * ctx->t_p_eval_us;
+    data.t_eval_ms = 1e-3 * ctx->t_eval_us;
+    data.n_p_eval = std::max(1, ctx->n_p_eval);
+    data.n_eval = std::max(1, ctx->n_eval);
+
+    return data;
+}
+
+void llama_perf_context_print(const struct llama_context * ctx) {
+    const auto data = llama_perf_context(ctx);
+
+    const double t_end_ms = 1e-3 * ggml_time_us();
+
+    LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
     LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, t_p_eval_ms, n_p_eval, t_p_eval_ms / n_p_eval, 1e3 / t_p_eval_ms * n_p_eval);
+            __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
     LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, t_eval_ms, n_eval, t_eval_ms / n_eval, 1e3 / t_eval_ms * n_eval);
-    LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - t_start_ms), (n_p_eval + n_eval));
-            } break;
-        case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
-            {
-                const auto * smpl = (const struct llama_sampler *) ctx;
-                const auto * p = (const struct llama_sampler_chain *) smpl->ctx;
-
-                const double t_sampler_ms = 1e-3 * p->t_sample_us;
-
-                const int32_t n_sampler = std::max(0, p->n_sample);
-
-                LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-                        __func__, t_sampler_ms, n_sampler, t_sampler_ms / n_sampler, 1e3 / t_sampler_ms * n_sampler);
-            } break;
-        default:
-            GGML_ABORT("invalid perf type");
-    }
+            __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
+    LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
 }
 
-void llama_perf_reset(void * ctx, enum llama_perf_type type) {
-    switch (type) {
-        case LLAMA_PERF_TYPE_CONTEXT:
-            {
-                auto * p = (struct llama_context *) ctx;
-
-                p->t_start_us = ggml_time_us();
-                p->t_eval_us = p->n_eval = 0;
-                p->t_p_eval_us = p->n_p_eval = 0;
-            } break;
-        case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
-            {
-                auto * smpl = (struct llama_sampler *) ctx;
-                auto * p = (struct llama_sampler_chain *) smpl->ctx;
-
-                p->t_sample_us = p->n_sample = 0;
-            } break;
-        default:
-            GGML_ABORT("invalid perf type");
-    }
+void llama_perf_context_reset(struct llama_context * ctx) {
+    ctx->t_start_us = ggml_time_us();
+    ctx->t_eval_us = ctx->n_eval = 0;
+    ctx->t_p_eval_us = ctx->n_p_eval = 0;
 }
 
 void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) {