Compare commits

...

11 Commits

Author SHA1 Message Date
Georgi Gerganov
36195c578e
Merge f42de2426e into 8db003a19d 2024-09-11 15:41:18 +03:00
Pavel Zloi
8db003a19d
py : support converting local models (#7547)
* Support of converting local models added to convert-hf-to-gguf-update.py

* Description fixed

* shutil added to imports
2024-09-11 15:29:51 +03:00
Xuan Son Nguyen
0996c5597f
llava : correct args for minicpmv-cli (#9429) 2024-09-11 12:59:13 +02:00
Xuan Son Nguyen
5bb2c5dbd2
files : remove accidentally added lora_test submodule (#9430) 2024-09-11 13:02:09 +03:00
Farbod Bijary
67155ab7f5
feat: Implements retrying logic for downloading models using --model-url flag (#9255)
* feat: Implements retrying logic for downloading models using --model-url flag

* Update common/common.cpp

Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>

* Update common/common.cpp

Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>

* apply comments

* implements a retry function to avoid duplication

* fix editorconfig

* change function name

---------

Co-authored-by: farbod <farbod.bjary82@gmail.com>
Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
2024-09-11 11:22:37 +02:00
Johannes Gäßler
5af118efda
CUDA: fix --split-mode row race condition (#9413) 2024-09-11 10:22:40 +02:00
Georgi Gerganov
f42de2426e
perf : separate functions in the API
ggml-ci
2024-09-11 09:56:41 +03:00
Georgi Gerganov
fd46535314
Update src/llama.cpp
Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>
2024-09-10 13:03:54 +03:00
Georgi Gerganov
6cce78c2ed
Merge branch 'master' into gg/llama-perf
ggml-ci
2024-09-10 11:38:09 +03:00
Georgi Gerganov
ade52b6cc6
common : add llama_arg 2024-09-08 08:57:56 +03:00
Georgi Gerganov
471e7e1e59
llama : llama_perf + option to disable timings during decode
ggml-ci
2024-09-08 08:54:34 +03:00
25 changed files with 193 additions and 111 deletions

View File

@ -720,6 +720,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
params.prompt = value;
}
));
add_opt(llama_arg(
{"--no-perf"},
format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
[](gpt_params & params) {
params.no_perf = true;
params.sparams.no_perf = true;
}
).set_env("LLAMA_ARG_NO_PERF"));
add_opt(llama_arg(
{"-f", "--file"}, "FNAME",
"a file containing the prompt (default: none)",

View File

@ -820,7 +820,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
}
llama_kv_cache_clear(lctx);
llama_synchronize(lctx);
llama_perf_reset(lctx, LLAMA_PERF_TYPE_CONTEXT);
llama_perf_reset_context(lctx);
}
iparams.model = model;
@ -916,6 +916,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
cparams.cb_eval_user_data = params.cb_eval_user_data;
cparams.offload_kqv = !params.no_kv_offload;
cparams.flash_attn = params.flash_attn;
cparams.no_perf = params.no_perf;
cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
@ -941,11 +942,37 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
#ifdef LLAMA_USE_CURL
#define CURL_MAX_RETRY 3
#define CURL_RETRY_DELAY_SECONDS 2
static bool starts_with(const std::string & str, const std::string & prefix) {
// While we wait for C++20's std::string::starts_with...
return str.rfind(prefix, 0) == 0;
}
static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
int remaining_attempts = max_attempts;
while (remaining_attempts > 0) {
fprintf(stderr, "%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
CURLcode res = curl_easy_perform(curl);
if (res == CURLE_OK) {
return true;
}
int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
fprintf(stderr, "%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
remaining_attempts--;
std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
}
fprintf(stderr, "%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
return false;
}
static bool llama_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
// Initialize libcurl
@ -1049,9 +1076,8 @@ static bool llama_download_file(const std::string & url, const std::string & pat
curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
CURLcode res = curl_easy_perform(curl.get());
if (res != CURLE_OK) {
fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
if (!was_perform_successful) {
return false;
}
@ -1126,11 +1152,10 @@ static bool llama_download_file(const std::string & url, const std::string & pat
};
// start the download
fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
fprintf(stderr, "%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
auto res = curl_easy_perform(curl.get());
if (res != CURLE_OK) {
fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
if (!was_perform_successful) {
return false;
}

View File

@ -124,6 +124,7 @@ struct gpt_sampler_params {
float mirostat_eta = 0.10f; // learning rate
bool penalize_nl = false; // consider newlines as a repeatable token
bool ignore_eos = false;
bool no_perf = false; // disable performance metrics
std::vector<enum gpt_sampler_type> samplers = {
GPT_SAMPLER_TYPE_TOP_K,
@ -246,6 +247,7 @@ struct gpt_params {
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool cont_batching = true; // insert new sequences for decoding on-the-fly
bool flash_attn = false; // flash attention
bool no_perf = false; // disable performance metrics
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool logits_all = false; // return logits for all tokens in the batch

View File

@ -142,7 +142,7 @@ std::string gpt_sampler_params::print() const {
struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
lparams.no_perf = false; // TODO: control via params
lparams.no_perf = params.no_perf;
auto * result = new gpt_sampler {
/* .params = */ params,
@ -257,10 +257,10 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
// TODO: measure grammar performance
if (gsmpl) {
llama_perf_print(gsmpl->chain, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
llama_perf_print_sampler(gsmpl->chain);
}
if (ctx) {
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
llama_perf_print_context(ctx);
}
}

View File

@ -31,6 +31,7 @@ import re
import requests
import sys
import json
import shutil
from hashlib import sha256
from enum import IntEnum, auto
@ -125,6 +126,21 @@ def download_model(model):
if tokt == TOKENIZER_TYPE.UGM:
files.append("spiece.model")
if os.path.isdir(repo):
# If repo is a path on the file system, copy the directory
for file in files:
src_path = os.path.join(repo, file)
dst_path = f"models/tokenizers/{name}/{file}"
if os.path.isfile(dst_path):
logger.info(f"{name}: File {dst_path} already exists - skipping")
continue
if os.path.isfile(src_path):
shutil.copy2(src_path, dst_path)
logger.info(f"{name}: Copied {src_path} to {dst_path}")
else:
logger.warning(f"{name}: Source file {src_path} does not exist")
else:
# If repo is a URL, download the files
for file in files:
save_path = f"models/tokenizers/{name}/{file}"
if os.path.isfile(save_path):

View File

@ -187,7 +187,7 @@ int main(int argc, char ** argv) {
}
LOG_TEE("\n");
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
llama_perf_print_context(ctx);
llama_batch_free(batch);

View File

@ -200,8 +200,8 @@ let t_main_end = ggml_time_us()
print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n\n")
llama_perf_print(UnsafeRawPointer(context), LLAMA_PERF_TYPE_CONTEXT)
llama_perf_print(UnsafeRawPointer(smpl), LLAMA_PERF_TYPE_SAMPLER_CHAIN)
llama_perf_print_sampler(smpl)
llama_perf_print_context(context)
private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
let utf8Count = text.utf8.count

View File

@ -229,8 +229,8 @@ int main(int argc, char ** argv) {
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
LOG_TEE("\n");
llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
llama_perf_print_sampler(smpl);
llama_perf_print_context(ctx);
fprintf(stderr, "\n");

View File

@ -306,7 +306,7 @@ int main(int argc, char ** argv) {
}
LOG_TEE("\n");
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
llama_perf_print_context(ctx);
// clean up
llama_batch_free(batch);

View File

@ -182,7 +182,7 @@ int main(int argc, char ** argv) {
}
LOG_TEE("\n");
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
llama_perf_print_context(ctx);
llama_free(ctx);
llama_free_model(model);

View File

@ -637,7 +637,7 @@ int main(int argc, char ** argv) {
g_collector.save_imatrix();
LOG_TEE("\n");
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
llama_perf_print_context(ctx);
llama_free(ctx);
llama_free_model(model);

View File

@ -1630,7 +1630,7 @@ int main(int argc, char ** argv) {
fflush(p_err->fout);
}
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
llama_perf_print_context(ctx);
llama_free(ctx);

View File

@ -308,7 +308,7 @@ int main(int argc, char ** argv) {
// process the prompt
process_prompt(ctx_llava, image_embed, &params, params.prompt);
llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
llama_perf_print_context(ctx_llava->ctx_llama);
llava_image_embed_free(image_embed);
ctx_llava->model = NULL;
llava_free(ctx_llava);
@ -325,7 +325,7 @@ int main(int argc, char ** argv) {
// process the prompt
process_prompt(ctx_llava, image_embed, &params, params.prompt);
llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
llama_perf_print_context(ctx_llava->ctx_llama);
llava_image_embed_free(image_embed);
ctx_llava->model = NULL;
llava_free(ctx_llava);

View File

@ -18,8 +18,8 @@ struct llava_context {
};
static void show_additional_info(int /*argc*/, char ** argv) {
LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
LOG_TEE("\nexample usage:\n\n%s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG_TEE("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
}
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
@ -255,7 +255,7 @@ int main(int argc, char ** argv) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, show_additional_info)) {
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
return 1;
}
@ -319,7 +319,7 @@ int main(int argc, char ** argv) {
}
}
printf("\n");
llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
llama_perf_print_context(ctx_llava->ctx_llama);
ctx_llava->model = NULL;
llava_free(ctx_llava);

View File

@ -240,8 +240,7 @@ int main(int argc, char ** argv){
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
LOG_TEE("\ntarget:\n\n");
llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
gpt_perf_print(ctx, smpl);
gpt_sampler_free(smpl);

View File

@ -415,7 +415,7 @@ int main(int argc, char ** argv) {
LOG_TEE("\n");
// TODO: print sampling/grammar timings for all clients
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
llama_perf_print_context(ctx);
llama_batch_free(batch);

View File

@ -256,7 +256,7 @@ int main(int argc, char ** argv) {
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
LOG_TEE("\n");
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
llama_perf_print_context(ctx);
fprintf(stderr, "\n");

View File

@ -2047,7 +2047,7 @@ int main(int argc, char ** argv) {
}
LOG_TEE("\n");
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
llama_perf_print_context(ctx);
write_logfile(ctx, params, model, results);
llama_free(ctx);

View File

@ -292,7 +292,7 @@ int main(int argc, char ** argv) {
}
LOG_TEE("\n");
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
llama_perf_print_context(ctx);
// clean up
llama_batch_free(query_batch);

View File

@ -154,8 +154,8 @@ int main(int argc, char ** argv) {
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
LOG_TEE("\n");
llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
llama_perf_print_sampler(smpl);
llama_perf_print_context(ctx);
fprintf(stderr, "\n");

View File

@ -616,7 +616,7 @@ int main(int argc, char ** argv) {
LOG_TEE("\ndraft:\n\n");
// TODO: print sampling/grammar timings for all drafts
llama_perf_print(ctx_dft, LLAMA_PERF_TYPE_CONTEXT);
llama_perf_print_context(ctx_dft);
LOG_TEE("\ntarget:\n\n");
gpt_perf_print(ctx_tgt, smpl);

View File

@ -26,7 +26,11 @@ void ggml_cuda_op_mul_mat_q(
// nrows_dst == nrows of the matrix that the kernel writes into
const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst};
// The stream-k decomposition is only faster for recent NVIDIA GPUs.
// Also its fixup needs to allocate a temporary buffer in the memory pool.
// There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
const bool use_stream_k = compute_capability >= CC_VOLTA && compute_capability < CC_OFFSET_AMD && src1_ncols == ne11;
const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k};
switch (src0->type) {
case GGML_TYPE_Q4_0:

View File

@ -2742,6 +2742,7 @@ struct mmq_args {
int64_t ne00; int64_t ne01; int64_t stride01;
int64_t ne10; int64_t ne11; int64_t stride11;
int64_t ne0;
bool use_stream_k;
};
template<ggml_type type>
@ -2777,8 +2778,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
const int ntx = (args.ne11 + mmq_x - 1) / mmq_x;
const dim3 block_nums_xy_tiling(nty, ntx, 1);
const bool use_stream_k = cc >= CC_VOLTA && cc < CC_OFFSET_AMD;
if (!use_stream_k) {
if (!args.use_stream_k) {
if (args.ne01 % mmq_y == 0) {
constexpr bool need_check = false;
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, shmem, stream>>>

View File

@ -343,7 +343,7 @@ extern "C" {
bool embeddings; // if true, extract embeddings (together with logits)
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
//bool no_perf; // whether to measure performance timings, TODO: implement
bool no_perf; // whether to measure performance timings
// Abort callback
// if it returns true, execution of llama_decode() will be aborted
@ -1173,13 +1173,30 @@ extern "C" {
// NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
//
enum llama_perf_type {
LLAMA_PERF_TYPE_CONTEXT = 0,
LLAMA_PERF_TYPE_SAMPLER_CHAIN = 1,
struct llama_perf_data_context {
double t_start_ms;
double t_load_ms;
double t_p_eval_ms;
double t_eval_ms;
int32_t n_p_eval;
int32_t n_eval;
};
LLAMA_API void llama_perf_print(const void * ctx, enum llama_perf_type type);
LLAMA_API void llama_perf_reset( void * ctx, enum llama_perf_type type);
struct llama_perf_data_sampler {
double t_sample_ms;
int32_t n_sample;
};
LLAMA_API struct llama_perf_data_context llama_perf_context(const struct llama_context * ctx);
LLAMA_API struct llama_perf_data_sampler llama_perf_sampler(const struct llama_sampler * chain);
LLAMA_API void llama_perf_print_context(const struct llama_context * ctx);
LLAMA_API void llama_perf_print_sampler(const struct llama_sampler * chain);
LLAMA_API void llama_perf_reset_context(struct llama_context * ctx);
LLAMA_API void llama_perf_reset_sampler(struct llama_sampler * chain);
LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);

View File

@ -2482,6 +2482,7 @@ struct llama_cparams {
bool causal_attn;
bool offload_kqv;
bool flash_attn;
bool no_perf;
enum llama_pooling_type pooling_type;
@ -6657,8 +6658,6 @@ static bool llm_load_tensors(
bool use_mlock,
llama_progress_callback progress_callback,
void * progress_callback_user_data) {
model.t_start_us = ggml_time_us();
auto & hparams = model.hparams;
model.split_mode = split_mode;
@ -8589,14 +8588,13 @@ static bool llm_load_tensors(
}
}
// loading time will be recalculate after the first eval, so
// we take page faults deferred by mmap() into consideration
model.t_load_us = ggml_time_us() - model.t_start_us;
return true;
}
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
model.t_start_us = ggml_time_us();
try {
llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
@ -8658,6 +8656,10 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
return -1;
}
// loading time will be recalculate after the first eval, so
// we take page faults deferred by mmap() into consideration
model.t_load_us = ggml_time_us() - model.t_start_us;
return 0;
}
@ -17941,6 +17943,7 @@ struct llama_context_params llama_context_default_params() {
/*.embeddings =*/ false,
/*.offload_kqv =*/ true,
/*.flash_attn =*/ false,
/*.no_perf =*/ true,
/*.abort_callback =*/ nullptr,
/*.abort_callback_data =*/ nullptr,
};
@ -18151,6 +18154,7 @@ struct llama_context * llama_new_context_with_model(
cparams.embeddings = params.embeddings;
cparams.offload_kqv = params.offload_kqv;
cparams.flash_attn = params.flash_attn;
cparams.no_perf = params.no_perf;
cparams.pooling_type = params.pooling_type;
cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
@ -20069,10 +20073,14 @@ void llama_synchronize(struct llama_context * ctx) {
// add the evaluation to the stats
if (ctx->n_queued_tokens == 1) {
if (!ctx->cparams.no_perf) {
ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us;
}
ctx->n_eval++;
} else if (ctx->n_queued_tokens > 1) {
if (!ctx->cparams.no_perf) {
ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us;
}
ctx->n_p_eval += ctx->n_queued_tokens;
}
@ -20679,65 +20687,68 @@ const char * llama_print_system_info(void) {
return s.c_str();
}
void llama_perf_print(const void * ctx, enum llama_perf_type type) {
switch (type) {
case LLAMA_PERF_TYPE_CONTEXT:
{
const auto * p = (const struct llama_context *) ctx;
struct llama_perf_data_context llama_perf_context(const struct llama_context * ctx) {
struct llama_perf_data_context data = {};
const double t_start_ms = 1e-3 * p->t_start_us;
const double t_end_ms = 1.00 * ggml_time_ms();
const double t_load_ms = 1e-3 * p->t_load_us;
const double t_p_eval_ms = 1e-3 * p->t_p_eval_us;
const double t_eval_ms = 1e-3 * p->t_eval_us;
if (ctx == nullptr) {
return data;
}
const int32_t n_p_eval = std::max(0, p->n_p_eval);
const int32_t n_eval = std::max(1, p->n_eval);
data.t_start_ms = 1e-3 * ctx->t_start_us;
data.t_load_ms = 1e-3 * ctx->t_load_us;
data.t_p_eval_ms = 1e-3 * ctx->t_p_eval_us;
data.t_eval_ms = 1e-3 * ctx->t_eval_us;
data.n_p_eval = std::max(1, ctx->n_p_eval);
data.n_eval = std::max(1, ctx->n_eval);
LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, t_load_ms);
return data;
}
struct llama_perf_data_sampler llama_perf_sampler(const struct llama_sampler * chain) {
struct llama_perf_data_sampler data = {};
if (chain == nullptr) {
return data;
}
const auto * p = (const struct llama_sampler_chain *) chain->ctx;
data.t_sample_ms = 1e-3 * p->t_sample_us;
data.n_sample = std::max(0, p->n_sample);
return data;
}
void llama_perf_print_context(const struct llama_context * ctx) {
const auto data = llama_perf_context(ctx);
const double t_end_ms = 1e-3 * ggml_time_us();
LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
__func__, t_p_eval_ms, n_p_eval, t_p_eval_ms / n_p_eval, 1e3 / t_p_eval_ms * n_p_eval);
__func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
__func__, t_eval_ms, n_eval, t_eval_ms / n_eval, 1e3 / t_eval_ms * n_eval);
LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - t_start_ms), (n_p_eval + n_eval));
} break;
case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
{
const auto * smpl = (const struct llama_sampler *) ctx;
const auto * p = (const struct llama_sampler_chain *) smpl->ctx;
__func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
}
const double t_sampler_ms = 1e-3 * p->t_sample_us;
const int32_t n_sampler = std::max(0, p->n_sample);
void llama_perf_print_sampler(const struct llama_sampler * chain) {
const auto data = llama_perf_sampler(chain);
LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
__func__, t_sampler_ms, n_sampler, t_sampler_ms / n_sampler, 1e3 / t_sampler_ms * n_sampler);
} break;
default:
GGML_ABORT("invalid perf type");
}
__func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
}
void llama_perf_reset(void * ctx, enum llama_perf_type type) {
switch (type) {
case LLAMA_PERF_TYPE_CONTEXT:
{
auto * p = (struct llama_context *) ctx;
void llama_perf_reset_context(struct llama_context * ctx) {
ctx->t_start_us = ggml_time_us();
ctx->t_eval_us = ctx->n_eval = 0;
ctx->t_p_eval_us = ctx->n_p_eval = 0;
}
p->t_start_us = ggml_time_us();
p->t_eval_us = p->n_eval = 0;
p->t_p_eval_us = p->n_p_eval = 0;
} break;
case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
{
auto * smpl = (struct llama_sampler *) ctx;
auto * p = (struct llama_sampler_chain *) smpl->ctx;
void llama_perf_reset_sampler(struct llama_sampler * chain) {
auto * p = (struct llama_sampler_chain *) chain->ctx;
p->t_sample_us = p->n_sample = 0;
} break;
default:
GGML_ABORT("invalid perf type");
}
}
void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) {