Compare commits

...

11 Commits

Author SHA1 Message Date
Georgi Gerganov
36195c578e
Merge f42de2426e into 8db003a19d 2024-09-11 15:41:18 +03:00
Pavel Zloi
8db003a19d
py : support converting local models (#7547)
* Support of converting local models added to convert-hf-to-gguf-update.py

* Description fixed

* shutil added to imports
2024-09-11 15:29:51 +03:00
Xuan Son Nguyen
0996c5597f
llava : correct args for minicpmv-cli (#9429) 2024-09-11 12:59:13 +02:00
Xuan Son Nguyen
5bb2c5dbd2
files : remove accidentally added lora_test submodule (#9430) 2024-09-11 13:02:09 +03:00
Farbod Bijary
67155ab7f5
feat: Implements retrying logic for downloading models using --model-url flag (#9255)
* feat: Implements retrying logic for downloading models using --model-url flag

* Update common/common.cpp

Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>

* Update common/common.cpp

Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>

* apply comments

* implements a retry function to avoid duplication

* fix editorconfig

* change function name

---------

Co-authored-by: farbod <farbod.bjary82@gmail.com>
Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
2024-09-11 11:22:37 +02:00
Johannes Gäßler
5af118efda
CUDA: fix --split-mode row race condition (#9413) 2024-09-11 10:22:40 +02:00
Georgi Gerganov
f42de2426e
perf : separate functions in the API
ggml-ci
2024-09-11 09:56:41 +03:00
Georgi Gerganov
fd46535314
Update src/llama.cpp
Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>
2024-09-10 13:03:54 +03:00
Georgi Gerganov
6cce78c2ed
Merge branch 'master' into gg/llama-perf
ggml-ci
2024-09-10 11:38:09 +03:00
Georgi Gerganov
ade52b6cc6
common : add llama_arg 2024-09-08 08:57:56 +03:00
Georgi Gerganov
471e7e1e59
llama : llama_perf + option to disable timings during decode
ggml-ci
2024-09-08 08:54:34 +03:00
25 changed files with 193 additions and 111 deletions

View File

@@ -720,6 +720,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.prompt = value;
         }
     ));
+    add_opt(llama_arg(
+        {"--no-perf"},
+        format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
+        [](gpt_params & params) {
+            params.no_perf = true;
+            params.sparams.no_perf = true;
+        }
+    ).set_env("LLAMA_ARG_NO_PERF"));
     add_opt(llama_arg(
         {"-f", "--file"}, "FNAME",
         "a file containing the prompt (default: none)",

View File

@@ -820,7 +820,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         }
         llama_kv_cache_clear(lctx);
         llama_synchronize(lctx);
-        llama_perf_reset(lctx, LLAMA_PERF_TYPE_CONTEXT);
+        llama_perf_reset_context(lctx);
     }
@@ -916,6 +916,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.cb_eval_user_data = params.cb_eval_user_data;
     cparams.offload_kqv = !params.no_kv_offload;
     cparams.flash_attn = params.flash_attn;
+    cparams.no_perf = params.no_perf;
     cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
     cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
@@ -941,11 +942,37 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
 #ifdef LLAMA_USE_CURL
+#define CURL_MAX_RETRY 3
+#define CURL_RETRY_DELAY_SECONDS 2
 static bool starts_with(const std::string & str, const std::string & prefix) {
     // While we wait for C++20's std::string::starts_with...
     return str.rfind(prefix, 0) == 0;
 }
+static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
+    int remaining_attempts = max_attempts;
+    while (remaining_attempts > 0) {
+        fprintf(stderr, "%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
+        CURLcode res = curl_easy_perform(curl);
+        if (res == CURLE_OK) {
+            return true;
+        }
+        int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
+        fprintf(stderr, "%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
+        remaining_attempts--;
+        std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
+    }
+    fprintf(stderr, "%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
+    return false;
+}
 static bool llama_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
     // Initialize libcurl
@@ -1049,9 +1076,8 @@ static bool llama_download_file(const std::string & url, const std::string & pat
     curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
     curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
-    CURLcode res = curl_easy_perform(curl.get());
-    if (res != CURLE_OK) {
-        fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
+    bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
+    if (!was_perform_successful) {
         return false;
     }
@@ -1126,11 +1152,10 @@ static bool llama_download_file(const std::string & url, const std::string & pat
     };
     // start the download
-    fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
+    fprintf(stderr, "%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
         llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
-    auto res = curl_easy_perform(curl.get());
-    if (res != CURLE_OK) {
-        fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
+    bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
+    if (!was_perform_successful) {
         return false;
     }
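
For reference, the retry delay in curl_perform_with_retry() above grows exponentially: it is retry_delay_seconds raised to the number of failures so far, times 1000, so with CURL_MAX_RETRY 3 and CURL_RETRY_DELAY_SECONDS 2 the sleeps after the first, second and third failed attempt are 1000 ms, 2000 ms and 4000 ms (the loop also sleeps once more after the final failure before giving up). A standalone sketch of just that formula, for illustration only and not part of the PR:

    #include <cmath>
    #include <cstdio>

    int main() {
        const int max_attempts        = 3; // CURL_MAX_RETRY
        const int retry_delay_seconds = 2; // CURL_RETRY_DELAY_SECONDS
        for (int remaining = max_attempts; remaining > 0; remaining--) {
            const int delay_ms = std::pow(retry_delay_seconds, max_attempts - remaining) * 1000;
            std::printf("after failed attempt %d: sleep %d ms\n", max_attempts - remaining + 1, delay_ms);
        }
        return 0; // prints 1000, 2000 and 4000 ms
    }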

View File

@@ -124,6 +124,7 @@ struct gpt_sampler_params {
     float mirostat_eta = 0.10f; // learning rate
     bool penalize_nl = false; // consider newlines as a repeatable token
     bool ignore_eos = false;
+    bool no_perf = false; // disable performance metrics
     std::vector<enum gpt_sampler_type> samplers = {
         GPT_SAMPLER_TYPE_TOP_K,
@@ -246,6 +247,7 @@ struct gpt_params {
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
     bool flash_attn = false; // flash attention
+    bool no_perf = false; // disable performance metrics
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool logits_all = false; // return logits for all tokens in the batch

View File

@@ -142,7 +142,7 @@ std::string gpt_sampler_params::print() const {
 struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
     llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
-    lparams.no_perf = false; // TODO: control via params
+    lparams.no_perf = params.no_perf;
     auto * result = new gpt_sampler {
         /* .params = */ params,
@@ -257,10 +257,10 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
     // TODO: measure grammar performance
     if (gsmpl) {
-        llama_perf_print(gsmpl->chain, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
+        llama_perf_print_sampler(gsmpl->chain);
     }
     if (ctx) {
-        llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+        llama_perf_print_context(ctx);
     }
 }
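
With the TODO above resolved, examples that use the common sampling helpers control the sampler-chain timings through the params struct rather than patching gpt_sampler_init(). A minimal sketch, not code from this PR (headers omitted), assuming a llama_model * and llama_context * obtained elsewhere:

    // sketch only: request a sampler chain without timing collection
    static void run_with_quiet_sampler(llama_model * model, llama_context * ctx) {
        gpt_sampler_params sparams;
        sparams.no_perf = true; // forwarded to lparams.no_perf in gpt_sampler_init()
        struct gpt_sampler * smpl = gpt_sampler_init(model, sparams);

        // ... decode and sample as usual ...

        gpt_perf_print(ctx, smpl); // sampler timings were never accumulated; context timings print as usual
        gpt_sampler_free(smpl);
    }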

View File

@@ -31,6 +31,7 @@ import re
 import requests
 import sys
 import json
+import shutil
 from hashlib import sha256
 from enum import IntEnum, auto
@@ -125,6 +126,21 @@ def download_model(model):
     if tokt == TOKENIZER_TYPE.UGM:
         files.append("spiece.model")
-    for file in files:
-        save_path = f"models/tokenizers/{name}/{file}"
-        if os.path.isfile(save_path):
+    if os.path.isdir(repo):
+        # If repo is a path on the file system, copy the directory
+        for file in files:
+            src_path = os.path.join(repo, file)
+            dst_path = f"models/tokenizers/{name}/{file}"
+            if os.path.isfile(dst_path):
+                logger.info(f"{name}: File {dst_path} already exists - skipping")
+                continue
+            if os.path.isfile(src_path):
+                shutil.copy2(src_path, dst_path)
+                logger.info(f"{name}: Copied {src_path} to {dst_path}")
+            else:
+                logger.warning(f"{name}: Source file {src_path} does not exist")
+    else:
+        # If repo is a URL, download the files
+        for file in files:
+            save_path = f"models/tokenizers/{name}/{file}"
+            if os.path.isfile(save_path):

View File

@@ -187,7 +187,7 @@ int main(int argc, char ** argv) {
     }
     LOG_TEE("\n");
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_print_context(ctx);
     llama_batch_free(batch);

View File

@@ -200,8 +200,8 @@ let t_main_end = ggml_time_us()
 print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n\n")
-llama_perf_print(UnsafeRawPointer(context), LLAMA_PERF_TYPE_CONTEXT)
-llama_perf_print(UnsafeRawPointer(smpl), LLAMA_PERF_TYPE_SAMPLER_CHAIN)
+llama_perf_print_sampler(smpl)
+llama_perf_print_context(context)
 private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
     let utf8Count = text.utf8.count

View File

@@ -229,8 +229,8 @@ int main(int argc, char ** argv) {
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
     LOG_TEE("\n");
-    llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_print_sampler(smpl);
+    llama_perf_print_context(ctx);
     fprintf(stderr, "\n");

View File

@@ -306,7 +306,7 @@ int main(int argc, char ** argv) {
     }
     LOG_TEE("\n");
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_print_context(ctx);
     // clean up
     llama_batch_free(batch);

View File

@@ -182,7 +182,7 @@ int main(int argc, char ** argv) {
     }
     LOG_TEE("\n");
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_print_context(ctx);
     llama_free(ctx);
     llama_free_model(model);

View File

@@ -637,7 +637,7 @@ int main(int argc, char ** argv) {
     g_collector.save_imatrix();
     LOG_TEE("\n");
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_print_context(ctx);
     llama_free(ctx);
     llama_free_model(model);

View File

@@ -1630,7 +1630,7 @@ int main(int argc, char ** argv) {
         fflush(p_err->fout);
     }
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_print_context(ctx);
     llama_free(ctx);

View File

@@ -308,7 +308,7 @@ int main(int argc, char ** argv) {
         // process the prompt
         process_prompt(ctx_llava, image_embed, &params, params.prompt);
-        llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
+        llama_perf_print_context(ctx_llava->ctx_llama);
         llava_image_embed_free(image_embed);
         ctx_llava->model = NULL;
         llava_free(ctx_llava);
@@ -325,7 +325,7 @@ int main(int argc, char ** argv) {
         // process the prompt
         process_prompt(ctx_llava, image_embed, &params, params.prompt);
-        llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
+        llama_perf_print_context(ctx_llava->ctx_llama);
         llava_image_embed_free(image_embed);
         ctx_llava->model = NULL;
         llava_free(ctx_llava);

View File

@@ -18,8 +18,8 @@ struct llava_context {
 };
 static void show_additional_info(int /*argc*/, char ** argv) {
-    LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
-    LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
+    LOG_TEE("\nexample usage:\n\n%s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    LOG_TEE("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
 }
 static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
@@ -255,7 +255,7 @@ int main(int argc, char ** argv) {
     gpt_params params;
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, show_additional_info)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
         return 1;
     }
@@ -319,7 +319,7 @@ int main(int argc, char ** argv) {
         }
     }
     printf("\n");
-    llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_print_context(ctx_llava->ctx_llama);
     ctx_llava->model = NULL;
     llava_free(ctx_llava);

View File

@@ -240,8 +240,7 @@ int main(int argc, char ** argv){
     LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
     LOG_TEE("\ntarget:\n\n");
-    llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    gpt_perf_print(ctx, smpl);
     gpt_sampler_free(smpl);

View File

@@ -415,7 +415,7 @@ int main(int argc, char ** argv) {
     LOG_TEE("\n");
     // TODO: print sampling/grammar timings for all clients
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_print_context(ctx);
     llama_batch_free(batch);

View File

@@ -256,7 +256,7 @@ int main(int argc, char ** argv) {
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
     LOG_TEE("\n");
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_print_context(ctx);
     fprintf(stderr, "\n");

View File

@@ -2047,7 +2047,7 @@ int main(int argc, char ** argv) {
     }
     LOG_TEE("\n");
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_print_context(ctx);
     write_logfile(ctx, params, model, results);
     llama_free(ctx);

View File

@@ -292,7 +292,7 @@ int main(int argc, char ** argv) {
     }
     LOG_TEE("\n");
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_print_context(ctx);
     // clean up
     llama_batch_free(query_batch);

View File

@@ -154,8 +154,8 @@ int main(int argc, char ** argv) {
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
     LOG_TEE("\n");
-    llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_print_sampler(smpl);
+    llama_perf_print_context(ctx);
     fprintf(stderr, "\n");

View File

@@ -616,7 +616,7 @@ int main(int argc, char ** argv) {
     LOG_TEE("\ndraft:\n\n");
     // TODO: print sampling/grammar timings for all drafts
-    llama_perf_print(ctx_dft, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_print_context(ctx_dft);
     LOG_TEE("\ntarget:\n\n");
     gpt_perf_print(ctx_tgt, smpl);

View File

@@ -26,7 +26,11 @@ void ggml_cuda_op_mul_mat_q(
     // nrows_dst == nrows of the matrix that the kernel writes into
     const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
-    const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst};
+    // The stream-k decomposition is only faster for recent NVIDIA GPUs.
+    // Also its fixup needs to allocate a temporary buffer in the memory pool.
+    // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
+    const bool use_stream_k = compute_capability >= CC_VOLTA && compute_capability < CC_OFFSET_AMD && src1_ncols == ne11;
+    const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k};
     switch (src0->type) {
         case GGML_TYPE_Q4_0:

View File

@@ -2742,6 +2742,7 @@ struct mmq_args {
     int64_t ne00; int64_t ne01; int64_t stride01;
     int64_t ne10; int64_t ne11; int64_t stride11;
     int64_t ne0;
+    bool use_stream_k;
 };
 template<ggml_type type>
@@ -2777,8 +2778,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
         const int ntx = (args.ne11 + mmq_x - 1) / mmq_x;
         const dim3 block_nums_xy_tiling(nty, ntx, 1);
-    const bool use_stream_k = cc >= CC_VOLTA && cc < CC_OFFSET_AMD;
-    if (!use_stream_k) {
+    if (!args.use_stream_k) {
         if (args.ne01 % mmq_y == 0) {
             constexpr bool need_check = false;
             mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, shmem, stream>>>

View File

@@ -343,7 +343,7 @@ extern "C" {
         bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
         bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
-        //bool no_perf;   // whether to measure performance timings, TODO: implement
+        bool no_perf;     // whether to measure performance timings
         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
@@ -1173,13 +1173,30 @@ extern "C" {
     // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
     //
-    enum llama_perf_type {
-        LLAMA_PERF_TYPE_CONTEXT = 0,
-        LLAMA_PERF_TYPE_SAMPLER_CHAIN = 1,
+    struct llama_perf_data_context {
+        double t_start_ms;
+        double t_load_ms;
+        double t_p_eval_ms;
+        double t_eval_ms;
+        int32_t n_p_eval;
+        int32_t n_eval;
     };
-    LLAMA_API void llama_perf_print(const void * ctx, enum llama_perf_type type);
-    LLAMA_API void llama_perf_reset(void * ctx, enum llama_perf_type type);
+    struct llama_perf_data_sampler {
+        double t_sample_ms;
+        int32_t n_sample;
+    };
+
+    LLAMA_API struct llama_perf_data_context llama_perf_context(const struct llama_context * ctx);
+    LLAMA_API struct llama_perf_data_sampler llama_perf_sampler(const struct llama_sampler * chain);
+
+    LLAMA_API void llama_perf_print_context(const struct llama_context * ctx);
+    LLAMA_API void llama_perf_print_sampler(const struct llama_sampler * chain);
+
+    LLAMA_API void llama_perf_reset_context(struct llama_context * ctx);
+    LLAMA_API void llama_perf_reset_sampler(struct llama_sampler * chain);
     LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
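
Since the old void-pointer llama_perf_print/llama_perf_reset pair is replaced above by typed per-object calls, callers pick the function matching the object they hold and can also fetch the raw numbers instead of printing. A minimal usage sketch, not code from this PR, assuming an existing context and a llama_sampler * that holds a sampler chain:

    // sketch only: the split perf API from the hunk above
    #include <cstdio>
    #include "llama.h"

    static void report_perf(llama_context * ctx, llama_sampler * smpl) {
        struct llama_perf_data_context pd = llama_perf_context(ctx); // raw counters, format them yourself
        fprintf(stderr, "prompt eval: %.2f ms over %d tokens\n", pd.t_p_eval_ms, pd.n_p_eval);

        llama_perf_print_context(ctx);  // or let libllama print its usual summary
        llama_perf_print_sampler(smpl); // smpl is assumed to hold a sampler chain

        llama_perf_reset_context(ctx);  // e.g. after a warm-up decode, as llama_init_from_gpt_params does above
    }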

View File

@@ -2482,6 +2482,7 @@ struct llama_cparams {
     bool causal_attn;
     bool offload_kqv;
     bool flash_attn;
+    bool no_perf;
     enum llama_pooling_type pooling_type;
@@ -6657,8 +6658,6 @@ static bool llm_load_tensors(
     bool use_mlock,
     llama_progress_callback progress_callback,
     void * progress_callback_user_data) {
-    model.t_start_us = ggml_time_us();
-
     auto & hparams = model.hparams;
     model.split_mode = split_mode;
@@ -8589,14 +8588,13 @@ static bool llm_load_tensors(
         }
     }
-    // loading time will be recalculate after the first eval, so
-    // we take page faults deferred by mmap() into consideration
-    model.t_load_us = ggml_time_us() - model.t_start_us;
-
     return true;
 }
 // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
 static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
+    model.t_start_us = ggml_time_us();
+
     try {
         llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
@@ -8658,6 +8656,10 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
         return -1;
     }
+    // loading time will be recalculate after the first eval, so
+    // we take page faults deferred by mmap() into consideration
+    model.t_load_us = ggml_time_us() - model.t_start_us;
+
     return 0;
 }
@@ -17941,6 +17943,7 @@ struct llama_context_params llama_context_default_params() {
         /*.embeddings =*/ false,
         /*.offload_kqv =*/ true,
         /*.flash_attn =*/ false,
+        /*.no_perf =*/ true,
         /*.abort_callback =*/ nullptr,
         /*.abort_callback_data =*/ nullptr,
     };
@@ -18151,6 +18154,7 @@ struct llama_context * llama_new_context_with_model(
     cparams.embeddings = params.embeddings;
     cparams.offload_kqv = params.offload_kqv;
     cparams.flash_attn = params.flash_attn;
+    cparams.no_perf = params.no_perf;
     cparams.pooling_type = params.pooling_type;
     cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
@@ -20069,10 +20073,14 @@ void llama_synchronize(struct llama_context * ctx) {
     // add the evaluation to the stats
     if (ctx->n_queued_tokens == 1) {
+        if (!ctx->cparams.no_perf) {
         ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us;
+        }
         ctx->n_eval++;
     } else if (ctx->n_queued_tokens > 1) {
+        if (!ctx->cparams.no_perf) {
         ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us;
+        }
         ctx->n_p_eval += ctx->n_queued_tokens;
     }
@@ -20679,65 +20687,68 @@ const char * llama_print_system_info(void) {
     return s.c_str();
 }
-void llama_perf_print(const void * ctx, enum llama_perf_type type) {
-    switch (type) {
-        case LLAMA_PERF_TYPE_CONTEXT:
-            {
-                const auto * p = (const struct llama_context *) ctx;
-                const double t_start_ms = 1e-3 * p->t_start_us;
-                const double t_end_ms = 1.00 * ggml_time_ms();
-                const double t_load_ms = 1e-3 * p->t_load_us;
-                const double t_p_eval_ms = 1e-3 * p->t_p_eval_us;
-                const double t_eval_ms = 1e-3 * p->t_eval_us;
-                const int32_t n_p_eval = std::max(0, p->n_p_eval);
-                const int32_t n_eval = std::max(1, p->n_eval);
-                LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, t_load_ms);
-                LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-                        __func__, t_p_eval_ms, n_p_eval, t_p_eval_ms / n_p_eval, 1e3 / t_p_eval_ms * n_p_eval);
-                LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-                        __func__, t_eval_ms, n_eval, t_eval_ms / n_eval, 1e3 / t_eval_ms * n_eval);
-                LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - t_start_ms), (n_p_eval + n_eval));
-            } break;
-        case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
-            {
-                const auto * smpl = (const struct llama_sampler *) ctx;
-                const auto * p = (const struct llama_sampler_chain *) smpl->ctx;
-                const double t_sampler_ms = 1e-3 * p->t_sample_us;
-                const int32_t n_sampler = std::max(0, p->n_sample);
-                LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-                        __func__, t_sampler_ms, n_sampler, t_sampler_ms / n_sampler, 1e3 / t_sampler_ms * n_sampler);
-            } break;
-        default:
-            GGML_ABORT("invalid perf type");
-    }
-}
+struct llama_perf_data_context llama_perf_context(const struct llama_context * ctx) {
+    struct llama_perf_data_context data = {};
+    if (ctx == nullptr) {
+        return data;
+    }
+    data.t_start_ms = 1e-3 * ctx->t_start_us;
+    data.t_load_ms = 1e-3 * ctx->t_load_us;
+    data.t_p_eval_ms = 1e-3 * ctx->t_p_eval_us;
+    data.t_eval_ms = 1e-3 * ctx->t_eval_us;
+    data.n_p_eval = std::max(1, ctx->n_p_eval);
+    data.n_eval = std::max(1, ctx->n_eval);
+    return data;
+}
+
+struct llama_perf_data_sampler llama_perf_sampler(const struct llama_sampler * chain) {
+    struct llama_perf_data_sampler data = {};
+    if (chain == nullptr) {
+        return data;
+    }
+    const auto * p = (const struct llama_sampler_chain *) chain->ctx;
+    data.t_sample_ms = 1e-3 * p->t_sample_us;
+    data.n_sample = std::max(0, p->n_sample);
+    return data;
+}
+
+void llama_perf_print_context(const struct llama_context * ctx) {
+    const auto data = llama_perf_context(ctx);
+    const double t_end_ms = 1e-3 * ggml_time_us();
+    LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
+    LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
+    LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
+    LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
+}
+
+void llama_perf_print_sampler(const struct llama_sampler * chain) {
+    const auto data = llama_perf_sampler(chain);
+    LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
+}
-void llama_perf_reset(void * ctx, enum llama_perf_type type) {
-    switch (type) {
-        case LLAMA_PERF_TYPE_CONTEXT:
-            {
-                auto * p = (struct llama_context *) ctx;
-                p->t_start_us = ggml_time_us();
-                p->t_eval_us = p->n_eval = 0;
-                p->t_p_eval_us = p->n_p_eval = 0;
-            } break;
-        case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
-            {
-                auto * smpl = (struct llama_sampler *) ctx;
-                auto * p = (struct llama_sampler_chain *) smpl->ctx;
-                p->t_sample_us = p->n_sample = 0;
-            } break;
-        default:
-            GGML_ABORT("invalid perf type");
-    }
-}
+void llama_perf_reset_context(struct llama_context * ctx) {
+    ctx->t_start_us = ggml_time_us();
+    ctx->t_eval_us = ctx->n_eval = 0;
+    ctx->t_p_eval_us = ctx->n_p_eval = 0;
+}
+
+void llama_perf_reset_sampler(struct llama_sampler * chain) {
+    auto * p = (struct llama_sampler_chain *) chain->ctx;
+    p->t_sample_us = p->n_sample = 0;
+}
 void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) {