From 10eb87409ec0797ec79dec87f1004b380e094cfd Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sun, 12 Jan 2025 16:09:49 +0200
Subject: [PATCH] shadow : cont gcc

ggml-ci
---
 common/arg.cpp                           | 936 +++++++++++------------
 common/json-schema-to-grammar.cpp        |   4 +-
 common/log.cpp                           |   4 +-
 examples/batched-bench/batched-bench.cpp |   8 +-
 examples/llava/clip.cpp                  |   8 +-
 examples/llava/clip.h                    |   6 +-
 examples/llava/llava.cpp                 |   6 +-
 examples/server/server.cpp               |  16 +-
 examples/simple-chat/simple-chat.cpp     |   3 +-
 src/llama-model.cpp                      |  13 +-
 src/llama-quant.cpp                      |   8 +-
 src/llama-vocab.cpp                      |   8 +-
 12 files changed, 509 insertions(+), 511 deletions(-)
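Note (editor's sketch, not part of the patch): the hunks below apply one mechanical rename. Lambda parameters named `params` (and a catch variable named `ex`) reuse names from the enclosing scope, which GCC reports when building with -Wshadow; the patch renames them (mostly to `cur`, and `ex` to `e`). A minimal, hypothetical example of the warning and the fix, with illustrative names only:

    struct common_params { bool usage = false; };

    void parser_init(common_params & params) {
        // with -Wshadow, GCC warns here: declaration of 'params' shadows a parameter
        auto before = [](common_params & params) { params.usage = true; };
        // renaming the lambda parameter (this patch uses 'cur') keeps -Wshadow quiet
        auto after  = [](common_params & cur)    { cur.usage = true; };
        (void) params; (void) before; (void) after;
    }
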
"true" : "false"), - [](common_params & params) { - params.use_color = true; + [](common_params & cur) { + cur.use_color = true; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP})); add_opt(common_arg( {"-t", "--threads"}, "N", string_format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads), - [](common_params & params, int value) { - params.cpuparams.n_threads = value; - if (params.cpuparams.n_threads <= 0) { - params.cpuparams.n_threads = std::thread::hardware_concurrency(); + [](common_params & cur, int value) { + cur.cpuparams.n_threads = value; + if (cur.cpuparams.n_threads <= 0) { + cur.cpuparams.n_threads = std::thread::hardware_concurrency(); } } ).set_env("LLAMA_ARG_THREADS")); add_opt(common_arg( {"-tb", "--threads-batch"}, "N", "number of threads to use during batch and prompt processing (default: same as --threads)", - [](common_params & params, int value) { - params.cpuparams_batch.n_threads = value; - if (params.cpuparams_batch.n_threads <= 0) { - params.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); + [](common_params & cur, int value) { + cur.cpuparams_batch.n_threads = value; + if (cur.cpuparams_batch.n_threads <= 0) { + cur.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); } } )); add_opt(common_arg( {"-C", "--cpu-mask"}, "M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")", - [](common_params & params, const std::string & mask) { - params.cpuparams.mask_valid = true; - if (!parse_cpu_mask(mask, params.cpuparams.cpumask)) { + [](common_params & cur, const std::string & mask) { + cur.cpuparams.mask_valid = true; + if (!parse_cpu_mask(mask, cur.cpuparams.cpumask)) { throw std::invalid_argument("invalid cpumask"); } } @@ -505,9 +505,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"-Cr", "--cpu-range"}, "lo-hi", "range of CPUs for affinity. 
Complements --cpu-mask", - [](common_params & params, const std::string & range) { - params.cpuparams.mask_valid = true; - if (!parse_cpu_range(range, params.cpuparams.cpumask)) { + [](common_params & cur, const std::string & range) { + cur.cpuparams.mask_valid = true; + if (!parse_cpu_range(range, cur.cpuparams.cpumask)) { throw std::invalid_argument("invalid range"); } } @@ -515,33 +515,33 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"--cpu-strict"}, "<0|1>", string_format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu), - [](common_params & params, const std::string & value) { - params.cpuparams.strict_cpu = std::stoul(value); + [](common_params & cur, const std::string & value) { + cur.cpuparams.strict_cpu = std::stoul(value); } )); add_opt(common_arg( {"--prio"}, "N", string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority), - [](common_params & params, int prio) { + [](common_params & cur, int prio) { if (prio < 0 || prio > 3) { throw std::invalid_argument("invalid value"); } - params.cpuparams.priority = (enum ggml_sched_priority) prio; + cur.cpuparams.priority = (enum ggml_sched_priority) prio; } )); add_opt(common_arg( {"--poll"}, "<0...100>", string_format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll), - [](common_params & params, const std::string & value) { - params.cpuparams.poll = std::stoul(value); + [](common_params & cur, const std::string & value) { + cur.cpuparams.poll = std::stoul(value); } )); add_opt(common_arg( {"-Cb", "--cpu-mask-batch"}, "M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)", - [](common_params & params, const std::string & mask) { - params.cpuparams_batch.mask_valid = true; - if (!parse_cpu_mask(mask, params.cpuparams_batch.cpumask)) { + [](common_params & cur, const std::string & mask) { + cur.cpuparams_batch.mask_valid = true; + if (!parse_cpu_mask(mask, cur.cpuparams_batch.cpumask)) { throw std::invalid_argument("invalid cpumask"); } } @@ -549,9 +549,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"-Crb", "--cpu-range-batch"}, "lo-hi", "ranges of CPUs for affinity. 
Complements --cpu-mask-batch", - [](common_params & params, const std::string & range) { - params.cpuparams_batch.mask_valid = true; - if (!parse_cpu_range(range, params.cpuparams_batch.cpumask)) { + [](common_params & cur, const std::string & range) { + cur.cpuparams_batch.mask_valid = true; + if (!parse_cpu_range(range, cur.cpuparams_batch.cpumask)) { throw std::invalid_argument("invalid range"); } } @@ -559,95 +559,95 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"--cpu-strict-batch"}, "<0|1>", "use strict CPU placement (default: same as --cpu-strict)", - [](common_params & params, int value) { - params.cpuparams_batch.strict_cpu = value; + [](common_params & cur, int value) { + cur.cpuparams_batch.strict_cpu = value; } )); add_opt(common_arg( {"--prio-batch"}, "N", string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority), - [](common_params & params, int prio) { + [](common_params & cur, int prio) { if (prio < 0 || prio > 3) { throw std::invalid_argument("invalid value"); } - params.cpuparams_batch.priority = (enum ggml_sched_priority) prio; + cur.cpuparams_batch.priority = (enum ggml_sched_priority) prio; } )); add_opt(common_arg( {"--poll-batch"}, "<0|1>", "use polling to wait for work (default: same as --poll)", - [](common_params & params, int value) { - params.cpuparams_batch.poll = value; + [](common_params & cur, int value) { + cur.cpuparams_batch.poll = value; } )); add_opt(common_arg( {"-lcs", "--lookup-cache-static"}, "FNAME", "path to static lookup cache to use for lookup decoding (not updated by generation)", - [](common_params & params, const std::string & value) { - params.lookup_cache_static = value; + [](common_params & cur, const std::string & value) { + cur.lookup_cache_static = value; } ).set_examples({LLAMA_EXAMPLE_LOOKUP})); add_opt(common_arg( {"-lcd", "--lookup-cache-dynamic"}, "FNAME", "path to dynamic lookup cache to use for lookup decoding (updated by generation)", - [](common_params & params, const std::string & value) { - params.lookup_cache_dynamic = value; + [](common_params & cur, const std::string & value) { + cur.lookup_cache_dynamic = value; } ).set_examples({LLAMA_EXAMPLE_LOOKUP})); add_opt(common_arg( {"-c", "--ctx-size"}, "N", string_format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx), - [](common_params & params, int value) { - params.n_ctx = value; + [](common_params & cur, int value) { + cur.n_ctx = value; } ).set_env("LLAMA_ARG_CTX_SIZE")); add_opt(common_arg( {"-n", "--predict", "--n-predict"}, "N", string_format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict), - [](common_params & params, int value) { - params.n_predict = value; + [](common_params & cur, int value) { + cur.n_predict = value; } ).set_env("LLAMA_ARG_N_PREDICT")); add_opt(common_arg( {"-b", "--batch-size"}, "N", string_format("logical maximum batch size (default: %d)", params.n_batch), - [](common_params & params, int value) { - params.n_batch = value; + [](common_params & cur, int value) { + cur.n_batch = value; } ).set_env("LLAMA_ARG_BATCH")); add_opt(common_arg( {"-ub", "--ubatch-size"}, "N", string_format("physical maximum batch size (default: %d)", params.n_ubatch), - [](common_params & params, int value) { - params.n_ubatch = value; + [](common_params & cur, int value) { + cur.n_ubatch = value; } ).set_env("LLAMA_ARG_UBATCH")); add_opt(common_arg( 
{"--keep"}, "N", string_format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep), - [](common_params & params, int value) { - params.n_keep = value; + [](common_params & cur, int value) { + cur.n_keep = value; } )); add_opt(common_arg( {"--no-context-shift"}, string_format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"), - [](common_params & params) { - params.ctx_shift = false; + [](common_params & cur) { + cur.ctx_shift = false; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT")); add_opt(common_arg( {"--chunks"}, "N", string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), - [](common_params & params, int value) { - params.n_chunks = value; + [](common_params & cur, int value) { + cur.n_chunks = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL})); add_opt(common_arg( {"-fa", "--flash-attn"}, string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"), - [](common_params & params) { - params.flash_attn = true; + [](common_params & cur) { + cur.flash_attn = true; } ).set_env("LLAMA_ARG_FLASH_ATTN")); add_opt(common_arg( @@ -655,115 +655,115 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ex == LLAMA_EXAMPLE_MAIN ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt" : "prompt to start generation with", - [](common_params & params, const std::string & value) { - params.prompt = value; + [](common_params & cur, const std::string & value) { + cur.prompt = value; } ).set_excludes({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--no-perf"}, string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? 
"true" : "false"), - [](common_params & params) { - params.no_perf = true; - params.sampling.no_perf = true; + [](common_params & cur) { + cur.no_perf = true; + cur.sampling.no_perf = true; } ).set_env("LLAMA_ARG_NO_PERF")); add_opt(common_arg( {"-f", "--file"}, "FNAME", "a file containing the prompt (default: none)", - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { std::ifstream file(value); if (!file) { throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); } - // store the external file name in params - params.prompt_file = value; - std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); - if (!params.prompt.empty() && params.prompt.back() == '\n') { - params.prompt.pop_back(); + // store the external file name in cur + cur.prompt_file = value; + std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(cur.prompt)); + if (!cur.prompt.empty() && cur.prompt.back() == '\n') { + cur.prompt.pop_back(); } } ).set_excludes({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--in-file"}, "FNAME", "an input file (repeat to specify multiple files)", - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { std::ifstream file(value); if (!file) { throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); } - params.in_files.push_back(value); + cur.in_files.push_back(value); } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(common_arg( {"-bf", "--binary-file"}, "FNAME", "binary file containing the prompt (default: none)", - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { std::ifstream file(value, std::ios::binary); if (!file) { throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); } - // store the external file name in params - params.prompt_file = value; + // store the external file name in cur + cur.prompt_file = value; std::ostringstream ss; ss << file.rdbuf(); - params.prompt = ss.str(); - fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str()); + cur.prompt = ss.str(); + fprintf(stderr, "Read %zu bytes from binary file %s\n", cur.prompt.size(), value.c_str()); } ).set_excludes({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"-e", "--escape"}, string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? 
"true" : "false"), - [](common_params & params) { - params.escape = true; + [](common_params & cur) { + cur.escape = true; } )); add_opt(common_arg( {"--no-escape"}, "do not process escape sequences", - [](common_params & params) { - params.escape = false; + [](common_params & cur) { + cur.escape = false; } )); add_opt(common_arg( {"-ptc", "--print-token-count"}, "N", string_format("print token count every N tokens (default: %d)", params.n_print), - [](common_params & params, int value) { - params.n_print = value; + [](common_params & cur, int value) { + cur.n_print = value; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"--prompt-cache"}, "FNAME", "file to cache prompt state for faster startup (default: none)", - [](common_params & params, const std::string & value) { - params.path_prompt_cache = value; + [](common_params & cur, const std::string & value) { + cur.path_prompt_cache = value; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"--prompt-cache-all"}, "if specified, saves user input and generations to cache as well\n", - [](common_params & params) { - params.prompt_cache_all = true; + [](common_params & cur) { + cur.prompt_cache_all = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"--prompt-cache-ro"}, "if specified, uses the prompt cache but does not update it", - [](common_params & params) { - params.prompt_cache_ro = true; + [](common_params & cur) { + cur.prompt_cache_ro = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"-r", "--reverse-prompt"}, "PROMPT", "halt generation at PROMPT, return control in interactive mode\n", - [](common_params & params, const std::string & value) { - params.antiprompt.emplace_back(value); + [](common_params & cur, const std::string & value) { + cur.antiprompt.emplace_back(value); } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"-sp", "--special"}, string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"), - [](common_params & params) { - params.special = true; + [](common_params & cur) { + cur.special = true; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( @@ -775,60 +775,60 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "(default: %s)", params.conversation ? "true" : "false" ), - [](common_params & params) { - params.conversation = true; + [](common_params & cur) { + cur.conversation = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"-i", "--interactive"}, string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"), - [](common_params & params) { - params.interactive = true; + [](common_params & cur) { + cur.interactive = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"-if", "--interactive-first"}, string_format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? 
"true" : "false"), - [](common_params & params) { - params.interactive_first = true; + [](common_params & cur) { + cur.interactive_first = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"-mli", "--multiline-input"}, "allows you to write or paste multiple lines without ending each in '\\'", - [](common_params & params) { - params.multiline_input = true; + [](common_params & cur) { + cur.multiline_input = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"--in-prefix-bos"}, "prefix BOS to user inputs, preceding the `--in-prefix` string", - [](common_params & params) { - params.input_prefix_bos = true; - params.enable_chat_template = false; + [](common_params & cur) { + cur.input_prefix_bos = true; + cur.enable_chat_template = false; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"--in-prefix"}, "STRING", "string to prefix user inputs with (default: empty)", - [](common_params & params, const std::string & value) { - params.input_prefix = value; - params.enable_chat_template = false; + [](common_params & cur, const std::string & value) { + cur.input_prefix = value; + cur.enable_chat_template = false; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); add_opt(common_arg( {"--in-suffix"}, "STRING", "string to suffix after user inputs with (default: empty)", - [](common_params & params, const std::string & value) { - params.input_suffix = value; - params.enable_chat_template = false; + [](common_params & cur, const std::string & value) { + cur.input_suffix = value; + cur.enable_chat_template = false; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); add_opt(common_arg( {"--no-warmup"}, "skip warming up the model with an empty run", - [](common_params & params) { - params.warmup = false; + [](common_params & cur) { + cur.warmup = false; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( @@ -837,154 +837,154 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? 
"enabled" : "disabled" ), - [](common_params & params) { - params.spm_infill = true; + [](common_params & cur) { + cur.spm_infill = true; } ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL})); add_opt(common_arg( {"--samplers"}, "SAMPLERS", string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()), - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { const auto sampler_names = string_split(value, ';'); - params.sampling.samplers = common_sampler_types_from_names(sampler_names, true); + cur.sampling.samplers = common_sampler_types_from_names(sampler_names, true); } ).set_sparam()); add_opt(common_arg( {"-s", "--seed"}, "SEED", string_format("RNG seed (default: %d, use random seed for %d)", params.sampling.seed, LLAMA_DEFAULT_SEED), - [](common_params & params, const std::string & value) { - params.sampling.seed = std::stoul(value); + [](common_params & cur, const std::string & value) { + cur.sampling.seed = std::stoul(value); } ).set_sparam()); add_opt(common_arg( {"--sampling-seq", "--sampler-seq"}, "SEQUENCE", string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), - [](common_params & params, const std::string & value) { - params.sampling.samplers = common_sampler_types_from_chars(value); + [](common_params & cur, const std::string & value) { + cur.sampling.samplers = common_sampler_types_from_chars(value); } ).set_sparam()); add_opt(common_arg( {"--ignore-eos"}, "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)", - [](common_params & params) { - params.sampling.ignore_eos = true; + [](common_params & cur) { + cur.sampling.ignore_eos = true; } ).set_sparam()); add_opt(common_arg( {"--temp"}, "N", string_format("temperature (default: %.1f)", (double)params.sampling.temp), - [](common_params & params, const std::string & value) { - params.sampling.temp = std::stof(value); - params.sampling.temp = std::max(params.sampling.temp, 0.0f); + [](common_params & cur, const std::string & value) { + cur.sampling.temp = std::stof(value); + cur.sampling.temp = std::max(cur.sampling.temp, 0.0f); } ).set_sparam()); add_opt(common_arg( {"--top-k"}, "N", string_format("top-k sampling (default: %d, 0 = disabled)", params.sampling.top_k), - [](common_params & params, int value) { - params.sampling.top_k = value; + [](common_params & cur, int value) { + cur.sampling.top_k = value; } ).set_sparam()); add_opt(common_arg( {"--top-p"}, "N", string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p), - [](common_params & params, const std::string & value) { - params.sampling.top_p = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.sampling.top_p = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--min-p"}, "N", string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sampling.min_p), - [](common_params & params, const std::string & value) { - params.sampling.min_p = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.sampling.min_p = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--xtc-probability"}, "N", string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability), - [](common_params & params, const std::string & value) { - params.sampling.xtc_probability = std::stof(value); + [](common_params & cur, 
const std::string & value) { + cur.sampling.xtc_probability = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--xtc-threshold"}, "N", string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sampling.xtc_threshold), - [](common_params & params, const std::string & value) { - params.sampling.xtc_threshold = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.sampling.xtc_threshold = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--typical"}, "N", string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sampling.typ_p), - [](common_params & params, const std::string & value) { - params.sampling.typ_p = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.sampling.typ_p = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--repeat-last-n"}, "N", string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n), - [](common_params & params, int value) { + [](common_params & cur, int value) { if (value < -1) { throw std::runtime_error(string_format("error: invalid repeat-last-n = %d\n", value)); } - params.sampling.penalty_last_n = value; - params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n); + cur.sampling.penalty_last_n = value; + cur.sampling.n_prev = std::max(cur.sampling.n_prev, cur.sampling.penalty_last_n); } ).set_sparam()); add_opt(common_arg( {"--repeat-penalty"}, "N", string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sampling.penalty_repeat), - [](common_params & params, const std::string & value) { - params.sampling.penalty_repeat = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.sampling.penalty_repeat = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--presence-penalty"}, "N", string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_present), - [](common_params & params, const std::string & value) { - params.sampling.penalty_present = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.sampling.penalty_present = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--frequency-penalty"}, "N", string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_freq), - [](common_params & params, const std::string & value) { - params.sampling.penalty_freq = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.sampling.penalty_freq = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--dry-multiplier"}, "N", string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sampling.dry_multiplier), - [](common_params & params, const std::string & value) { - params.sampling.dry_multiplier = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.sampling.dry_multiplier = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--dry-base"}, "N", string_format("set DRY sampling base value (default: %.2f)", (double)params.sampling.dry_base), - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { float potential_base = std::stof(value); if (potential_base >= 1.0f) { - params.sampling.dry_base = potential_base; + cur.sampling.dry_base = potential_base; } } ).set_sparam()); 
     add_opt(common_arg(
         {"--dry-allowed-length"}, "N",
         string_format("set allowed length for DRY sampling (default: %d)", params.sampling.dry_allowed_length),
-        [](common_params & params, int value) {
-            params.sampling.dry_allowed_length = value;
+        [](common_params & cur, int value) {
+            cur.sampling.dry_allowed_length = value;
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--dry-penalty-last-n"}, "N",
         string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n),
-        [](common_params & params, int value) {
+        [](common_params & cur, int value) {
             if (value < -1) {
                 throw std::runtime_error(string_format("error: invalid dry-penalty-last-n = %d\n", value));
             }
-            params.sampling.dry_penalty_last_n = value;
+            cur.sampling.dry_penalty_last_n = value;
         }
     ).set_sparam());
     add_opt(common_arg(
@@ -998,55 +998,55 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                     std::string formatted_b = (b == "\n") ? "\\n" : b;
                     return a + ", '" + formatted_b + "'";
                 }).c_str()),
-        [](common_params & params, const std::string & value) {
+        [](common_params & cur, const std::string & value) {
             static bool defaults_cleared = false;
 
             if (!defaults_cleared) {
-                params.sampling.dry_sequence_breakers.clear();
+                cur.sampling.dry_sequence_breakers.clear();
                 defaults_cleared = true;
             }
 
             if (value == "none") {
-                params.sampling.dry_sequence_breakers.clear();
+                cur.sampling.dry_sequence_breakers.clear();
             } else {
-                params.sampling.dry_sequence_breakers.emplace_back(value);
+                cur.sampling.dry_sequence_breakers.emplace_back(value);
             }
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--dynatemp-range"}, "N",
         string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sampling.dynatemp_range),
-        [](common_params & params, const std::string & value) {
-            params.sampling.dynatemp_range = std::stof(value);
+        [](common_params & cur, const std::string & value) {
+            cur.sampling.dynatemp_range = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--dynatemp-exp"}, "N",
         string_format("dynamic temperature exponent (default: %.1f)", (double)params.sampling.dynatemp_exponent),
-        [](common_params & params, const std::string & value) {
-            params.sampling.dynatemp_exponent = std::stof(value);
+        [](common_params & cur, const std::string & value) {
+            cur.sampling.dynatemp_exponent = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--mirostat"}, "N",
         string_format("use Mirostat sampling.\nTop K, Nucleus and Locally Typical samplers are ignored if used.\n"
             "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sampling.mirostat),
-        [](common_params & params, int value) {
-            params.sampling.mirostat = value;
+        [](common_params & cur, int value) {
+            cur.sampling.mirostat = value;
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--mirostat-lr"}, "N",
         string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sampling.mirostat_eta),
-        [](common_params & params, const std::string & value) {
-            params.sampling.mirostat_eta = std::stof(value);
+        [](common_params & cur, const std::string & value) {
+            cur.sampling.mirostat_eta = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--mirostat-ent"}, "N",
         string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sampling.mirostat_tau),
-        [](common_params & params, const std::string & value) {
-            params.sampling.mirostat_tau = std::stof(value);
+        [](common_params & cur, const std::string & value) {
+            cur.sampling.mirostat_tau = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
@@ -1054,7 +1054,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "modifies the likelihood of token appearing in the completion,\n"
         "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
         "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'",
-        [](common_params & params, const std::string & value) {
+        [](common_params & cur, const std::string & value) {
             std::stringstream ss(value);
             llama_token key;
             char sign;
@@ -1062,7 +1062,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             try {
                 if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
                     const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
-                    params.sampling.logit_bias.push_back({key, bias});
+                    cur.sampling.logit_bias.push_back({key, bias});
                 } else {
                     throw std::invalid_argument("invalid input format");
                 }
@@ -1074,14 +1074,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     add_opt(common_arg(
         {"--grammar"}, "GRAMMAR",
         string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sampling.grammar.c_str()),
-        [](common_params & params, const std::string & value) {
-            params.sampling.grammar = value;
+        [](common_params & cur, const std::string & value) {
+            cur.sampling.grammar = value;
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--grammar-file"}, "FNAME",
         "file to read grammar from",
-        [](common_params & params, const std::string & value) {
+        [](common_params & cur, const std::string & value) {
             std::ifstream file(value);
             if (!file) {
                 throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
@@ -1089,130 +1089,130 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             std::copy(
                 std::istreambuf_iterator<char>(file),
                 std::istreambuf_iterator<char>(),
-                std::back_inserter(params.sampling.grammar)
+                std::back_inserter(cur.sampling.grammar)
             );
         }
     ).set_sparam());
     add_opt(common_arg(
         {"-j", "--json-schema"}, "SCHEMA",
         "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
-        [](common_params & params, const std::string & value) {
-            params.sampling.grammar = json_schema_to_grammar(json::parse(value));
+        [](common_params & cur, const std::string & value) {
+            cur.sampling.grammar = json_schema_to_grammar(json::parse(value));
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--pooling"}, "{none,mean,cls,last,rank}",
         "pooling type for embeddings, use model default if unspecified",
-        [](common_params & params, const std::string & value) {
-            /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
-            else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
-            else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
-            else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
-            else if (value == "rank") { params.pooling_type = LLAMA_POOLING_TYPE_RANK; }
+        [](common_params & cur, const std::string & value) {
+            /**/ if (value == "none") { cur.pooling_type = LLAMA_POOLING_TYPE_NONE; }
+            else if (value == "mean") { cur.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
+            else if (value == "cls") { cur.pooling_type = LLAMA_POOLING_TYPE_CLS; }
+            else if (value == "last") { cur.pooling_type = LLAMA_POOLING_TYPE_LAST; }
+            else if (value == "rank") { cur.pooling_type = LLAMA_POOLING_TYPE_RANK; }
             else { throw std::invalid_argument("invalid value"); }
         }
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING"));
     add_opt(common_arg(
         {"--attention"}, "{causal,non-causal}",
         "attention type for embeddings, use model default if unspecified",
-        [](common_params & params, const std::string & value) {
-            /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
-            else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; }
+        [](common_params & cur, const std::string & value) {
+            /**/ if (value == "causal") { cur.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
+            else if (value == "non-causal") { cur.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; }
             else { throw std::invalid_argument("invalid value"); }
         }
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
     add_opt(common_arg(
         {"--rope-scaling"}, "{none,linear,yarn}",
         "RoPE frequency scaling method, defaults to linear unless specified by the model",
-        [](common_params & params, const std::string & value) {
-            /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
-            else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
-            else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
+        [](common_params & cur, const std::string & value) {
+            /**/ if (value == "none") { cur.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
+            else if (value == "linear") { cur.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
+            else if (value == "yarn") { cur.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
             else { throw std::invalid_argument("invalid value"); }
         }
     ).set_env("LLAMA_ARG_ROPE_SCALING_TYPE"));
     add_opt(common_arg(
         {"--rope-scale"}, "N",
         "RoPE context scaling factor, expands context by a factor of N",
-        [](common_params & params, const std::string & value) {
-            params.rope_freq_scale = 1.0f / std::stof(value);
+        [](common_params & cur, const std::string & value) {
+            cur.rope_freq_scale = 1.0f / std::stof(value);
         }
     ).set_env("LLAMA_ARG_ROPE_SCALE"));
     add_opt(common_arg(
{"--rope-freq-base"}, "N", "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)", - [](common_params & params, const std::string & value) { - params.rope_freq_base = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.rope_freq_base = std::stof(value); } ).set_env("LLAMA_ARG_ROPE_FREQ_BASE")); add_opt(common_arg( {"--rope-freq-scale"}, "N", "RoPE frequency scaling factor, expands context by a factor of 1/N", - [](common_params & params, const std::string & value) { - params.rope_freq_scale = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.rope_freq_scale = std::stof(value); } ).set_env("LLAMA_ARG_ROPE_FREQ_SCALE")); add_opt(common_arg( {"--yarn-orig-ctx"}, "N", string_format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx), - [](common_params & params, int value) { - params.yarn_orig_ctx = value; + [](common_params & cur, int value) { + cur.yarn_orig_ctx = value; } ).set_env("LLAMA_ARG_YARN_ORIG_CTX")); add_opt(common_arg( {"--yarn-ext-factor"}, "N", string_format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor), - [](common_params & params, const std::string & value) { - params.yarn_ext_factor = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.yarn_ext_factor = std::stof(value); } ).set_env("LLAMA_ARG_YARN_EXT_FACTOR")); add_opt(common_arg( {"--yarn-attn-factor"}, "N", string_format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor), - [](common_params & params, const std::string & value) { - params.yarn_attn_factor = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.yarn_attn_factor = std::stof(value); } ).set_env("LLAMA_ARG_YARN_ATTN_FACTOR")); add_opt(common_arg( {"--yarn-beta-slow"}, "N", string_format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow), - [](common_params & params, const std::string & value) { - params.yarn_beta_slow = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.yarn_beta_slow = std::stof(value); } ).set_env("LLAMA_ARG_YARN_BETA_SLOW")); add_opt(common_arg( {"--yarn-beta-fast"}, "N", string_format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast), - [](common_params & params, const std::string & value) { - params.yarn_beta_fast = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.yarn_beta_fast = std::stof(value); } ).set_env("LLAMA_ARG_YARN_BETA_FAST")); add_opt(common_arg( {"-gan", "--grp-attn-n"}, "N", string_format("group-attention factor (default: %d)", params.grp_attn_n), - [](common_params & params, int value) { - params.grp_attn_n = value; + [](common_params & cur, int value) { + cur.grp_attn_n = value; } ).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_PASSKEY})); add_opt(common_arg( {"-gaw", "--grp-attn-w"}, "N", string_format("group-attention width (default: %d)", params.grp_attn_w), - [](common_params & params, int value) { - params.grp_attn_w = value; + [](common_params & cur, int value) { + cur.grp_attn_w = value; } ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"-dkvc", "--dump-kv-cache"}, "verbose print of the KV cache", - [](common_params & params) { - params.dump_kv_cache = true; + [](common_params & cur) { + cur.dump_kv_cache = true; } )); 
     add_opt(common_arg(
         {"-nkvo", "--no-kv-offload"},
         "disable KV offload",
-        [](common_params & params) {
-            params.no_kv_offload = true;
+        [](common_params & cur) {
+            cur.no_kv_offload = true;
         }
     ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
     add_opt(common_arg(
@@ -1224,8 +1224,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             get_all_kv_cache_types().c_str(),
             ggml_type_name(params.cache_type_k)
         ),
-        [](common_params & params, const std::string & value) {
-            params.cache_type_k = kv_cache_type_from_str(value);
+        [](common_params & cur, const std::string & value) {
+            cur.cache_type_k = kv_cache_type_from_str(value);
         }
     ).set_env("LLAMA_ARG_CACHE_TYPE_K"));
     add_opt(common_arg(
@@ -1237,157 +1237,157 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             get_all_kv_cache_types().c_str(),
             ggml_type_name(params.cache_type_v)
         ),
-        [](common_params & params, const std::string & value) {
-            params.cache_type_v = kv_cache_type_from_str(value);
+        [](common_params & cur, const std::string & value) {
+            cur.cache_type_v = kv_cache_type_from_str(value);
         }
     ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
     add_opt(common_arg(
         {"--perplexity", "--all-logits"},
         string_format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"),
-        [](common_params & params) {
-            params.logits_all = true;
+        [](common_params & cur) {
+            cur.logits_all = true;
         }
     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"--hellaswag"},
         "compute HellaSwag score over random tasks from datafile supplied with -f",
-        [](common_params & params) {
-            params.hellaswag = true;
+        [](common_params & cur) {
+            cur.hellaswag = true;
         }
     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"--hellaswag-tasks"}, "N",
         string_format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks),
-        [](common_params & params, int value) {
-            params.hellaswag_tasks = value;
+        [](common_params & cur, int value) {
+            cur.hellaswag_tasks = value;
         }
     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"--winogrande"},
         "compute Winogrande score over random tasks from datafile supplied with -f",
-        [](common_params & params) {
-            params.winogrande = true;
+        [](common_params & cur) {
+            cur.winogrande = true;
         }
     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"--winogrande-tasks"}, "N",
         string_format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks),
-        [](common_params & params, int value) {
-            params.winogrande_tasks = value;
+        [](common_params & cur, int value) {
+            cur.winogrande_tasks = value;
         }
     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"--multiple-choice"},
         "compute multiple choice score over random tasks from datafile supplied with -f",
-        [](common_params & params) {
-            params.multiple_choice = true;
+        [](common_params & cur) {
+            cur.multiple_choice = true;
         }
     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"--multiple-choice-tasks"}, "N",
         string_format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks),
-        [](common_params & params, int value) {
-            params.multiple_choice_tasks = value;
+        [](common_params & cur, int value) {
+            cur.multiple_choice_tasks = value;
         }
     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"--kl-divergence"},
         "computes KL-divergence to logits provided via --kl-divergence-base",
-        [](common_params & params) {
-            params.kl_divergence = true;
+        [](common_params & cur) {
+            cur.kl_divergence = true;
         }
     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"--save-all-logits", "--kl-divergence-base"}, "FNAME",
         "set logits file",
-        [](common_params & params, const std::string & value) {
-            params.logits_file = value;
+        [](common_params & cur, const std::string & value) {
+            cur.logits_file = value;
         }
     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"--ppl-stride"}, "N",
         string_format("stride for perplexity calculation (default: %d)", params.ppl_stride),
-        [](common_params & params, int value) {
-            params.ppl_stride = value;
+        [](common_params & cur, int value) {
+            cur.ppl_stride = value;
         }
     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"--ppl-output-type"}, "<0|1>",
         string_format("output type for perplexity calculation (default: %d)", params.ppl_output_type),
-        [](common_params & params, int value) {
-            params.ppl_output_type = value;
+        [](common_params & cur, int value) {
+            cur.ppl_output_type = value;
         }
     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"-dt", "--defrag-thold"}, "N",
         string_format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold),
-        [](common_params & params, const std::string & value) {
-            params.defrag_thold = std::stof(value);
+        [](common_params & cur, const std::string & value) {
+            cur.defrag_thold = std::stof(value);
         }
     ).set_env("LLAMA_ARG_DEFRAG_THOLD"));
     add_opt(common_arg(
         {"-np", "--parallel"}, "N",
         string_format("number of parallel sequences to decode (default: %d)", params.n_parallel),
-        [](common_params & params, int value) {
-            params.n_parallel = value;
+        [](common_params & cur, int value) {
+            cur.n_parallel = value;
         }
     ).set_env("LLAMA_ARG_N_PARALLEL"));
     add_opt(common_arg(
         {"-ns", "--sequences"}, "N",
         string_format("number of sequences to decode (default: %d)", params.n_sequences),
-        [](common_params & params, int value) {
-            params.n_sequences = value;
+        [](common_params & cur, int value) {
+            cur.n_sequences = value;
         }
     ).set_examples({LLAMA_EXAMPLE_PARALLEL}));
     add_opt(common_arg(
         {"-cb", "--cont-batching"},
         string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.cont_batching = true;
+        [](common_params & cur) {
+            cur.cont_batching = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING"));
     add_opt(common_arg(
         {"-nocb", "--no-cont-batching"},
         "disable continuous batching",
-        [](common_params & params) {
-            params.cont_batching = false;
+        [](common_params & cur) {
+            cur.cont_batching = false;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
     add_opt(common_arg(
         {"--mmproj"}, "FILE",
         "path to a multimodal projector file for LLaVA. see examples/llava/README.md",
-        [](common_params & params, const std::string & value) {
-            params.mmproj = value;
+        [](common_params & cur, const std::string & value) {
+            cur.mmproj = value;
         }
     ).set_examples({LLAMA_EXAMPLE_LLAVA}));
     add_opt(common_arg(
         {"--image"}, "FILE",
Specify multiple times for batching", - [](common_params & params, const std::string & value) { - params.image.emplace_back(value); + [](common_params & cur, const std::string & value) { + cur.image.emplace_back(value); } ).set_examples({LLAMA_EXAMPLE_LLAVA})); if (llama_supports_rpc()) { add_opt(common_arg( {"--rpc"}, "SERVERS", "comma separated list of RPC servers", - [](common_params & params, const std::string & value) { - params.rpc_servers = value; + [](common_params & cur, const std::string & value) { + cur.rpc_servers = value; } ).set_env("LLAMA_ARG_RPC")); } add_opt(common_arg( {"--mlock"}, "force system to keep model in RAM rather than swapping or compressing", - [](common_params & params) { - params.use_mlock = true; + [](common_params & cur) { + cur.use_mlock = true; } ).set_env("LLAMA_ARG_MLOCK")); add_opt(common_arg( {"--no-mmap"}, "do not memory-map model (slower load but may reduce pageouts if not using mlock)", - [](common_params & params) { - params.use_mmap = false; + [](common_params & cur) { + cur.use_mmap = false; } ).set_env("LLAMA_ARG_NO_MMAP")); add_opt(common_arg( @@ -1398,10 +1398,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "- numactl: use the CPU map provided by numactl\n" "if run without this previously, it is recommended to drop the system page cache before using this\n" "see https://github.com/ggerganov/llama.cpp/issues/1437", - [](common_params & params, const std::string & value) { - /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } - else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } - else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } + [](common_params & cur, const std::string & value) { + /**/ if (value == "distribute" || value == "") { cur.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } + else if (value == "isolate") { cur.numa = GGML_NUMA_STRATEGY_ISOLATE; } + else if (value == "numactl") { cur.numa = GGML_NUMA_STRATEGY_NUMACTL; } else { throw std::invalid_argument("invalid value"); } } ).set_env("LLAMA_ARG_NUMA")); @@ -1409,8 +1409,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"-dev", "--device"}, "", "comma-separated list of devices to use for offloading (none = don't offload)\n" "use --list-devices to see a list of available devices", - [](common_params & params, const std::string & value) { - params.devices = parse_device_list(value); + [](common_params & cur, const std::string & value) { + cur.devices = parse_device_list(value); } ).set_env("LLAMA_ARG_DEVICE")); add_opt(common_arg( @@ -1432,8 +1432,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N", "number of layers to store in VRAM", - [](common_params & params, int value) { - params.n_gpu_layers = value; + [](common_params & cur, int value) { + cur.n_gpu_layers = value; if (!llama_supports_gpu_offload()) { fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n"); fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n"); @@ -1447,14 +1447,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "- none: use one GPU only\n" "- layer (default): split layers and KV across GPUs\n" "- row: split rows across GPUs", - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { 
             std::string arg_next = value;
             if (arg_next == "none") {
-                params.split_mode = LLAMA_SPLIT_MODE_NONE;
+                cur.split_mode = LLAMA_SPLIT_MODE_NONE;
             } else if (arg_next == "layer") {
-                params.split_mode = LLAMA_SPLIT_MODE_LAYER;
+                cur.split_mode = LLAMA_SPLIT_MODE_LAYER;
             } else if (arg_next == "row") {
-                params.split_mode = LLAMA_SPLIT_MODE_ROW;
+                cur.split_mode = LLAMA_SPLIT_MODE_ROW;
             } else {
                 throw std::invalid_argument("invalid value");
             }
@@ -1466,7 +1466,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     add_opt(common_arg(
         {"-ts", "--tensor-split"}, "N0,N1,N2,...",
         "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1",
-        [](common_params & params, const std::string & value) {
+        [](common_params & cur, const std::string & value) {
             std::string arg_next = value;
 
             // split string by , and /
@@ -1480,9 +1480,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
             for (size_t i = 0; i < llama_max_devices(); ++i) {
                 if (i < split_arg.size()) {
-                    params.tensor_split[i] = std::stof(split_arg[i]);
+                    cur.tensor_split[i] = std::stof(split_arg[i]);
                 } else {
-                    params.tensor_split[i] = 0.0f;
+                    cur.tensor_split[i] = 0.0f;
                 }
             }
             if (!llama_supports_gpu_offload()) {
@@ -1493,8 +1493,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     add_opt(common_arg(
         {"-mg", "--main-gpu"}, "INDEX",
         string_format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu),
-        [](common_params & params, int value) {
-            params.main_gpu = value;
+        [](common_params & cur, int value) {
+            cur.main_gpu = value;
             if (!llama_supports_gpu_offload()) {
                 fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n");
             }
@@ -1503,16 +1503,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     add_opt(common_arg(
         {"--check-tensors"},
         string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
-        [](common_params & params) {
-            params.check_tensors = true;
+        [](common_params & cur) {
+            cur.check_tensors = true;
         }
     ));
     add_opt(common_arg(
         {"--override-kv"}, "KEY=TYPE:VALUE",
         "advanced option to override model metadata by key. may be specified multiple times.\n"
example: --override-kv tokenizer.ggml.add_bos_token=bool:false", - [](common_params & params, const std::string & value) { - if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) { + [](common_params & cur, const std::string & value) { + if (!string_parse_kv_override(value.c_str(), cur.kv_overrides)) { throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", value.c_str())); } } @@ -1520,47 +1520,47 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"--lora"}, "FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)", - [](common_params & params, const std::string & value) { - params.lora_adapters.push_back({ std::string(value), 1.0, nullptr }); + [](common_params & cur, const std::string & value) { + cur.lora_adapters.push_back({ std::string(value), 1.0, nullptr }); } // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); add_opt(common_arg( {"--lora-scaled"}, "FNAME", "SCALE", "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)", - [](common_params & params, const std::string & fname, const std::string & scale) { - params.lora_adapters.push_back({ fname, std::stof(scale), nullptr }); + [](common_params & cur, const std::string & fname, const std::string & scale) { + cur.lora_adapters.push_back({ fname, std::stof(scale), nullptr }); } // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); add_opt(common_arg( {"--control-vector"}, "FNAME", "add a control vector\nnote: this argument can be repeated to add multiple control vectors", - [](common_params & params, const std::string & value) { - params.control_vectors.push_back({ 1.0f, value, }); + [](common_params & cur, const std::string & value) { + cur.control_vectors.push_back({ 1.0f, value, }); } )); add_opt(common_arg( {"--control-vector-scaled"}, "FNAME", "SCALE", "add a control vector with user defined scaling SCALE\n" "note: this argument can be repeated to add multiple scaled control vectors", - [](common_params & params, const std::string & fname, const std::string & scale) { - params.control_vectors.push_back({ std::stof(scale), fname }); + [](common_params & cur, const std::string & fname, const std::string & scale) { + cur.control_vectors.push_back({ std::stof(scale), fname }); } )); add_opt(common_arg( {"--control-vector-layer-range"}, "START", "END", "layer range to apply the control vector(s) to, start and end inclusive", - [](common_params & params, const std::string & start, const std::string & end) { - params.control_vector_layer_start = std::stoi(start); - params.control_vector_layer_end = std::stoi(end); + [](common_params & cur, const std::string & start, const std::string & end) { + cur.control_vector_layer_start = std::stoi(start); + cur.control_vector_layer_end = std::stoi(end); } )); add_opt(common_arg( {"-a", "--alias"}, "STRING", "set alias for model name (to be used by REST API)", - [](common_params & params, const std::string & value) { - params.model_alias = value; + [](common_params & cur, const std::string & value) { + cur.model_alias = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS")); add_opt(common_arg( @@ -1571,89 
@@ -1571,89 +1571,89 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 "model path (default: `models/$filename` with filename from `--hf-file` "
                 "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH
             ),
-        [](common_params & params, const std::string & value) {
-            params.model = value;
+        [](common_params & cur, const std::string & value) {
+            cur.model = value;
         }
     ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
     add_opt(common_arg(
         {"-mu", "--model-url"}, "MODEL_URL",
         "model download url (default: unused)",
-        [](common_params & params, const std::string & value) {
-            params.model_url = value;
+        [](common_params & cur, const std::string & value) {
+            cur.model_url = value;
         }
     ).set_env("LLAMA_ARG_MODEL_URL"));
     add_opt(common_arg(
         {"-hfr", "--hf-repo"}, "REPO",
         "Hugging Face model repository (default: unused)",
-        [](common_params & params, const std::string & value) {
-            params.hf_repo = value;
+        [](common_params & cur, const std::string & value) {
+            cur.hf_repo = value;
         }
     ).set_env("LLAMA_ARG_HF_REPO"));
     add_opt(common_arg(
         {"-hff", "--hf-file"}, "FILE",
         "Hugging Face model file (default: unused)",
-        [](common_params & params, const std::string & value) {
-            params.hf_file = value;
+        [](common_params & cur, const std::string & value) {
+            cur.hf_file = value;
         }
     ).set_env("LLAMA_ARG_HF_FILE"));
     add_opt(common_arg(
         {"-hfrv", "--hf-repo-v"}, "REPO",
         "Hugging Face model repository for the vocoder model (default: unused)",
-        [](common_params & params, const std::string & value) {
-            params.vocoder.hf_repo = value;
+        [](common_params & cur, const std::string & value) {
+            cur.vocoder.hf_repo = value;
         }
     ).set_env("LLAMA_ARG_HF_REPO_V"));
     add_opt(common_arg(
         {"-hffv", "--hf-file-v"}, "FILE",
         "Hugging Face model file for the vocoder model (default: unused)",
-        [](common_params & params, const std::string & value) {
-            params.vocoder.hf_file = value;
+        [](common_params & cur, const std::string & value) {
-            cur.vocoder.hf_file = value;
         }
     ).set_env("LLAMA_ARG_HF_FILE_V"));
     add_opt(common_arg(
         {"-hft", "--hf-token"}, "TOKEN",
         "Hugging Face access token (default: value from HF_TOKEN environment variable)",
-        [](common_params & params, const std::string & value) {
-            params.hf_token = value;
+        [](common_params & cur, const std::string & value) {
+            cur.hf_token = value;
         }
     ).set_env("HF_TOKEN"));
     add_opt(common_arg(
         {"--context-file"}, "FNAME",
         "file to load context from (repeat to specify multiple files)",
-        [](common_params & params, const std::string & value) {
+        [](common_params & cur, const std::string & value) {
             std::ifstream file(value, std::ios::binary);
             if (!file) {
                 throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
             }
-            params.context_files.push_back(value);
+            cur.context_files.push_back(value);
         }
     ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
     add_opt(common_arg(
         {"--chunk-size"}, "N",
         string_format("minimum length of embedded text chunks (default: %d)", params.chunk_size),
-        [](common_params & params, int value) {
-            params.chunk_size = value;
+        [](common_params & cur, int value) {
+            cur.chunk_size = value;
         }
     ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
     add_opt(common_arg(
         {"--chunk-separator"}, "STRING",
         string_format("separator between chunks (default: '%s')", params.chunk_separator.c_str()),
-        [](common_params & params, const std::string & value) {
-            params.chunk_separator = value;
+        [](common_params & cur, const std::string & value) {
+            cur.chunk_separator = value;
         }
).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); add_opt(common_arg( {"--junk"}, "N", string_format("number of times to repeat the junk text (default: %d)", params.n_junk), - [](common_params & params, int value) { - params.n_junk = value; + [](common_params & cur, int value) { + cur.n_junk = value; } ).set_examples({LLAMA_EXAMPLE_PASSKEY})); add_opt(common_arg( {"--pos"}, "N", string_format("position of the passkey in the junk text (default: %d)", params.i_pos), - [](common_params & params, int value) { - params.i_pos = value; + [](common_params & cur, int value) { + cur.i_pos = value; } ).set_examples({LLAMA_EXAMPLE_PASSKEY})); add_opt(common_arg( @@ -1664,152 +1664,152 @@ common_params_context common_params_parser_init(common_params & params, llama_ex : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR ? params.cvector_outfile.c_str() : params.out_file.c_str()), - [](common_params & params, const std::string & value) { - params.out_file = value; - params.cvector_outfile = value; - params.lora_outfile = value; + [](common_params & cur, const std::string & value) { + cur.out_file = value; + cur.cvector_outfile = value; + cur.lora_outfile = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA})); add_opt(common_arg( {"-ofreq", "--output-frequency"}, "N", string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq), - [](common_params & params, int value) { - params.n_out_freq = value; + [](common_params & cur, int value) { + cur.n_out_freq = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(common_arg( {"--save-frequency"}, "N", string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq), - [](common_params & params, int value) { - params.n_save_freq = value; + [](common_params & cur, int value) { + cur.n_save_freq = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(common_arg( {"--process-output"}, string_format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"), - [](common_params & params) { - params.process_output = true; + [](common_params & cur) { + cur.process_output = true; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(common_arg( {"--no-ppl"}, string_format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"), - [](common_params & params) { - params.compute_ppl = false; + [](common_params & cur) { + cur.compute_ppl = false; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(common_arg( {"--chunk", "--from-chunk"}, "N", string_format("start processing the input from chunk N (default: %d)", params.i_chunk), - [](common_params & params, int value) { - params.i_chunk = value; + [](common_params & cur, int value) { + cur.i_chunk = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(common_arg( {"-pps"}, string_format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? 
"true" : "false"), - [](common_params & params) { - params.is_pp_shared = true; + [](common_params & cur) { + cur.is_pp_shared = true; } ).set_examples({LLAMA_EXAMPLE_BENCH})); add_opt(common_arg( {"-npp"}, "n0,n1,...", "number of prompt tokens", - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { auto p = string_split(value, ','); - params.n_pp.insert(params.n_pp.end(), p.begin(), p.end()); + cur.n_pp.insert(cur.n_pp.end(), p.begin(), p.end()); } ).set_examples({LLAMA_EXAMPLE_BENCH})); add_opt(common_arg( {"-ntg"}, "n0,n1,...", "number of text generation tokens", - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { auto p = string_split(value, ','); - params.n_tg.insert(params.n_tg.end(), p.begin(), p.end()); + cur.n_tg.insert(cur.n_tg.end(), p.begin(), p.end()); } ).set_examples({LLAMA_EXAMPLE_BENCH})); add_opt(common_arg( {"-npl"}, "n0,n1,...", "number of parallel prompts", - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { auto p = string_split(value, ','); - params.n_pl.insert(params.n_pl.end(), p.begin(), p.end()); + cur.n_pl.insert(cur.n_pl.end(), p.begin(), p.end()); } ).set_examples({LLAMA_EXAMPLE_BENCH})); add_opt(common_arg( {"--embd-normalize"}, "N", string_format("normalisation for embeddings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize), - [](common_params & params, int value) { - params.embd_normalize = value; + [](common_params & cur, int value) { + cur.embd_normalize = value; } ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); add_opt(common_arg( {"--embd-output-format"}, "FORMAT", "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix", - [](common_params & params, const std::string & value) { - params.embd_out = value; + [](common_params & cur, const std::string & value) { + cur.embd_out = value; } ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); add_opt(common_arg( {"--embd-separator"}, "STRING", "separator of embeddings (default \\n) for example \"<#sep#>\"", - [](common_params & params, const std::string & value) { - params.embd_sep = value; + [](common_params & cur, const std::string & value) { + cur.embd_sep = value; } ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); add_opt(common_arg( {"--host"}, "HOST", string_format("ip address to listen (default: %s)", params.hostname.c_str()), - [](common_params & params, const std::string & value) { - params.hostname = value; + [](common_params & cur, const std::string & value) { + cur.hostname = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST")); add_opt(common_arg( {"--port"}, "PORT", string_format("port to listen (default: %d)", params.port), - [](common_params & params, int value) { - params.port = value; + [](common_params & cur, int value) { + cur.port = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT")); add_opt(common_arg( {"--path"}, "PATH", string_format("path to serve static files from (default: %s)", params.public_path.c_str()), - [](common_params & params, const std::string & value) { - params.public_path = value; + [](common_params & cur, const std::string & value) { + cur.public_path = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH")); add_opt(common_arg( {"--no-webui"}, string_format("Disable the Web UI (default: %s)", params.webui ? 
"enabled" : "disabled"), - [](common_params & params) { - params.webui = false; + [](common_params & cur) { + cur.webui = false; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI")); add_opt(common_arg( {"--embedding", "--embeddings"}, string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), - [](common_params & params) { - params.embedding = true; + [](common_params & cur) { + cur.embedding = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS")); add_opt(common_arg( {"--reranking", "--rerank"}, string_format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"), - [](common_params & params) { - params.reranking = true; + [](common_params & cur) { + cur.reranking = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING")); add_opt(common_arg( {"--api-key"}, "KEY", "API key to use for authentication (default: none)", - [](common_params & params, const std::string & value) { - params.api_keys.push_back(value); + [](common_params & cur, const std::string & value) { + cur.api_keys.push_back(value); } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY")); add_opt(common_arg( {"--api-key-file"}, "FNAME", "path to file containing API keys (default: none)", - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { std::ifstream key_file(value); if (!key_file) { throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); @@ -1817,7 +1817,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex std::string key; while (std::getline(key_file, key)) { if (!key.empty()) { - params.api_keys.push_back(key); + cur.api_keys.push_back(key); } } key_file.close(); @@ -1826,75 +1826,75 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"--ssl-key-file"}, "FNAME", "path to file a PEM-encoded SSL private key", - [](common_params & params, const std::string & value) { - params.ssl_file_key = value; + [](common_params & cur, const std::string & value) { + cur.ssl_file_key = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_KEY_FILE")); add_opt(common_arg( {"--ssl-cert-file"}, "FNAME", "path to file a PEM-encoded SSL certificate", - [](common_params & params, const std::string & value) { - params.ssl_file_cert = value; + [](common_params & cur, const std::string & value) { + cur.ssl_file_cert = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE")); add_opt(common_arg( {"-to", "--timeout"}, "N", string_format("server read/write timeout in seconds (default: %d)", params.timeout_read), - [](common_params & params, int value) { - params.timeout_read = value; - params.timeout_write = value; + [](common_params & cur, int value) { + cur.timeout_read = value; + cur.timeout_write = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TIMEOUT")); add_opt(common_arg( {"--threads-http"}, "N", string_format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http), - [](common_params & params, int value) { - params.n_threads_http = value; + [](common_params & cur, int value) { + cur.n_threads_http = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP")); add_opt(common_arg( {"--cache-reuse"}, "N", string_format("min chunk 
size to attempt reusing from the cache via KV shifting (default: %d)", params.n_cache_reuse), - [](common_params & params, int value) { - params.n_cache_reuse = value; + [](common_params & cur, int value) { + cur.n_cache_reuse = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE")); add_opt(common_arg( {"--metrics"}, string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"), - [](common_params & params) { - params.endpoint_metrics = true; + [](common_params & cur) { + cur.endpoint_metrics = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS")); add_opt(common_arg( {"--slots"}, string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), - [](common_params & params) { - params.endpoint_slots = true; + [](common_params & cur) { + cur.endpoint_slots = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS")); add_opt(common_arg( {"--props"}, string_format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"), - [](common_params & params) { - params.endpoint_props = true; + [](common_params & cur) { + cur.endpoint_props = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS")); add_opt(common_arg( {"--no-slots"}, "disables slots monitoring endpoint", - [](common_params & params) { - params.endpoint_slots = false; + [](common_params & cur) { + cur.endpoint_slots = false; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS")); add_opt(common_arg( {"--slot-save-path"}, "PATH", "path to save slot kv cache (default: disabled)", - [](common_params & params, const std::string & value) { - params.slot_save_path = value; + [](common_params & cur, const std::string & value) { + cur.slot_save_path = value; // if doesn't end with DIRECTORY_SEPARATOR, add it - if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { - params.slot_save_path += DIRECTORY_SEPARATOR; + if (!cur.slot_save_path.empty() && cur.slot_save_path[cur.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { + cur.slot_save_path += DIRECTORY_SEPARATOR; } } ).set_examples({LLAMA_EXAMPLE_SERVER})); @@ -1905,7 +1905,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "if suffix/prefix are specified, template will be disabled\n" "list of built-in templates:\n%s", list_builtin_chat_templates().c_str() ), - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { if (!common_chat_verify_template(value)) { throw std::runtime_error(string_format( "error: the supplied chat template is not supported: %s\n" @@ -1913,73 +1913,73 @@ common_params_context common_params_parser_init(common_params & params, llama_ex value.c_str() )); } - params.chat_template = value; + cur.chat_template = value; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE")); add_opt(common_arg( {"-sps", "--slot-prompt-similarity"}, "SIMILARITY", string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), - [](common_params & params, const std::string & value) { - params.slot_prompt_similarity = std::stof(value); + [](common_params & cur, const std::string & value) 
{ + cur.slot_prompt_similarity = std::stof(value); } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--lora-init-without-apply"}, string_format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"), - [](common_params & params) { - params.lora_init_without_apply = true; + [](common_params & cur) { + cur.lora_init_without_apply = true; } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--simple-io"}, "use basic IO for better compatibility in subprocesses and limited consoles", - [](common_params & params) { - params.simple_io = true; + [](common_params & cur) { + cur.simple_io = true; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); add_opt(common_arg( {"--positive-file"}, "FNAME", string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()), - [](common_params & params, const std::string & value) { - params.cvector_positive_file = value; + [](common_params & cur, const std::string & value) { + cur.cvector_positive_file = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(common_arg( {"--negative-file"}, "FNAME", string_format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()), - [](common_params & params, const std::string & value) { - params.cvector_negative_file = value; + [](common_params & cur, const std::string & value) { + cur.cvector_negative_file = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(common_arg( {"--pca-batch"}, "N", string_format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch), - [](common_params & params, int value) { - params.n_pca_batch = value; + [](common_params & cur, int value) { + cur.n_pca_batch = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(common_arg( {"--pca-iter"}, "N", string_format("number of iterations used for PCA (default: %d)", params.n_pca_iterations), - [](common_params & params, int value) { - params.n_pca_iterations = value; + [](common_params & cur, int value) { + cur.n_pca_iterations = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(common_arg( {"--method"}, "{pca, mean}", "dimensionality reduction method to be used (default: pca)", - [](common_params & params, const std::string & value) { - /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; } - else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; } + [](common_params & cur, const std::string & value) { + /**/ if (value == "pca") { cur.cvector_dimre_method = DIMRE_METHOD_PCA; } + else if (value == "mean") { cur.cvector_dimre_method = DIMRE_METHOD_MEAN; } else { throw std::invalid_argument("invalid value"); } } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(common_arg( {"--output-format"}, "{md,jsonl}", "output format for batched-bench results (default: md)", - [](common_params & params, const std::string & value) { - /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; } - else if (value == "md") { params.batched_bench_output_jsonl = false; } + [](common_params & cur, const std::string & value) { + /**/ if (value == "jsonl") { cur.batched_bench_output_jsonl = true; } + else if (value == "md") { cur.batched_bench_output_jsonl = false; } else { std::invalid_argument("invalid value"); } } ).set_examples({LLAMA_EXAMPLE_BENCH})); @@ 
-2007,16 +2007,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"-v", "--verbose", "--log-verbose"}, "Set verbosity level to infinity (i.e. log all messages, useful for debugging)", - [](common_params & params) { - params.verbosity = INT_MAX; + [](common_params & cur) { + cur.verbosity = INT_MAX; common_log_set_verbosity_thold(INT_MAX); } )); add_opt(common_arg( {"-lv", "--verbosity", "--log-verbosity"}, "N", "Set the verbosity threshold. Messages with a higher verbosity will be ignored.", - [](common_params & params, int value) { - params.verbosity = value; + [](common_params & cur, int value) { + cur.verbosity = value; common_log_set_verbosity_thold(value); } ).set_env("LLAMA_LOG_VERBOSITY")); @@ -2039,29 +2039,29 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"-td", "--threads-draft"}, "N", "number of threads to use during generation (default: same as --threads)", - [](common_params & params, int value) { - params.speculative.cpuparams.n_threads = value; - if (params.speculative.cpuparams.n_threads <= 0) { - params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency(); + [](common_params & cur, int value) { + cur.speculative.cpuparams.n_threads = value; + if (cur.speculative.cpuparams.n_threads <= 0) { + cur.speculative.cpuparams.n_threads = std::thread::hardware_concurrency(); } } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-tbd", "--threads-batch-draft"}, "N", "number of threads to use during batch and prompt processing (default: same as --threads-draft)", - [](common_params & params, int value) { - params.speculative.cpuparams_batch.n_threads = value; - if (params.speculative.cpuparams_batch.n_threads <= 0) { - params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); + [](common_params & cur, int value) { + cur.speculative.cpuparams_batch.n_threads = value; + if (cur.speculative.cpuparams_batch.n_threads <= 0) { + cur.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); } } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-Cd", "--cpu-mask-draft"}, "M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", - [](common_params & params, const std::string & mask) { - params.speculative.cpuparams.mask_valid = true; - if (!parse_cpu_mask(mask, params.speculative.cpuparams.cpumask)) { + [](common_params & cur, const std::string & mask) { + cur.speculative.cpuparams.mask_valid = true; + if (!parse_cpu_mask(mask, cur.speculative.cpuparams.cpumask)) { throw std::invalid_argument("invalid cpumask"); } } @@ -2069,9 +2069,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"-Crd", "--cpu-range-draft"}, "lo-hi", "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft", - [](common_params & params, const std::string & range) { - params.speculative.cpuparams.mask_valid = true; - if (!parse_cpu_range(range, params.speculative.cpuparams.cpumask)) { + [](common_params & cur, const std::string & range) { + cur.speculative.cpuparams.mask_valid = true; + if (!parse_cpu_range(range, cur.speculative.cpuparams.cpumask)) { throw std::invalid_argument("invalid range"); } } @@ -2079,33 +2079,33 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"--cpu-strict-draft"}, "<0|1>", "Use strict CPU placement for draft model (default: same as --cpu-strict)", - [](common_params & params, int value) { - params.speculative.cpuparams.strict_cpu = value; + [](common_params & cur, int value) { + cur.speculative.cpuparams.strict_cpu = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"--prio-draft"}, "N", string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams.priority), - [](common_params & params, int prio) { + [](common_params & cur, int prio) { if (prio < 0 || prio > 3) { throw std::invalid_argument("invalid value"); } - params.speculative.cpuparams.priority = (enum ggml_sched_priority) prio; + cur.speculative.cpuparams.priority = (enum ggml_sched_priority) prio; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"--poll-draft"}, "<0|1>", "Use polling to wait for draft model work (default: same as --poll])", - [](common_params & params, int value) { - params.speculative.cpuparams.poll = value; + [](common_params & cur, int value) { + cur.speculative.cpuparams.poll = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-Cbd", "--cpu-mask-batch-draft"}, "M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", - [](common_params & params, const std::string & mask) { - params.speculative.cpuparams_batch.mask_valid = true; - if (!parse_cpu_mask(mask, params.speculative.cpuparams_batch.cpumask)) { + [](common_params & cur, const std::string & mask) { + cur.speculative.cpuparams_batch.mask_valid = true; + if (!parse_cpu_mask(mask, cur.speculative.cpuparams_batch.cpumask)) { throw std::invalid_argument("invalid cpumask"); } } @@ -2113,9 +2113,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi", "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft-batch)", - [](common_params & params, const std::string & range) { - params.speculative.cpuparams_batch.mask_valid = true; - if (!parse_cpu_range(range, params.speculative.cpuparams_batch.cpumask)) { + [](common_params & cur, const std::string & range) { + cur.speculative.cpuparams_batch.mask_valid = true; + if (!parse_cpu_range(range, cur.speculative.cpuparams_batch.cpumask)) { throw std::invalid_argument("invalid cpumask"); } } @@ -2123,75 +2123,75 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"--cpu-strict-batch-draft"}, "<0|1>", "Use strict CPU placement for draft model (default: --cpu-strict-draft)", - [](common_params & params, int value) { - params.speculative.cpuparams_batch.strict_cpu = value; + [](common_params & cur, int value) { + cur.speculative.cpuparams_batch.strict_cpu = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"--prio-batch-draft"}, "N", string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams_batch.priority), - [](common_params & params, int prio) { + [](common_params & cur, int prio) { if (prio < 0 || prio > 3) { throw std::invalid_argument("invalid value"); } - params.speculative.cpuparams_batch.priority = (enum ggml_sched_priority) prio; + cur.speculative.cpuparams_batch.priority = (enum ggml_sched_priority) prio; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"--poll-batch-draft"}, "<0|1>", "Use polling to wait for draft model work (default: --poll-draft)", - [](common_params & params, int value) { - params.speculative.cpuparams_batch.poll = value; + [](common_params & cur, int value) { + cur.speculative.cpuparams_batch.poll = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"--draft-max", "--draft", "--draft-n"}, "N", string_format("number of tokens to draft for speculative decoding (default: %d)", params.speculative.n_max), - [](common_params & params, int value) { - params.speculative.n_max = value; + [](common_params & cur, int value) { + cur.speculative.n_max = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX")); add_opt(common_arg( {"--draft-min", "--draft-n-min"}, "N", string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min), - [](common_params & params, int value) { - params.speculative.n_min = value; + [](common_params & cur, int value) { + cur.speculative.n_min = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN")); add_opt(common_arg( {"--draft-p-split"}, "P", string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split), - [](common_params & params, const std::string & value) { - params.speculative.p_split = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.speculative.p_split = std::stof(value); } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT")); add_opt(common_arg( {"--draft-p-min"}, "P", string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min), - [](common_params & params, const std::string & value) { - params.speculative.p_min = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.speculative.p_min = 
std::stof(value); } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN")); add_opt(common_arg( {"-cd", "--ctx-size-draft"}, "N", string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx), - [](common_params & params, int value) { - params.speculative.n_ctx = value; + [](common_params & cur, int value) { + cur.speculative.n_ctx = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT")); add_opt(common_arg( {"-devd", "--device-draft"}, "", "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n" "use --list-devices to see a list of available devices", - [](common_params & params, const std::string & value) { - params.speculative.devices = parse_device_list(value); + [](common_params & cur, const std::string & value) { + cur.speculative.devices = parse_device_list(value); } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N", "number of layers to store in VRAM for the draft model", - [](common_params & params, int value) { - params.speculative.n_gpu_layers = value; + [](common_params & cur, int value) { + cur.speculative.n_gpu_layers = value; if (!llama_supports_gpu_offload()) { fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n"); fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n"); @@ -2202,16 +2202,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"-md", "--model-draft"}, "FNAME", "draft model for speculative decoding (default: unused)", - [](common_params & params, const std::string & value) { - params.speculative.model = value; + [](common_params & cur, const std::string & value) { + cur.speculative.model = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT")); add_opt(common_arg( {"-mv", "--model-vocoder"}, "FNAME", "vocoder model for audio generation (default: unused)", - [](common_params & params, const std::string & value) { - params.vocoder.model = value; + [](common_params & cur, const std::string & value) { + cur.vocoder.model = value; } ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER})); @@ -2219,11 +2219,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"--tts-oute-default"}, string_format("use default OuteTTS models (note: can download weights from the internet)"), - [](common_params & params) { - params.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF"; - params.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf"; - params.vocoder.hf_repo = "ggml-org/WavTokenizer"; - params.vocoder.hf_file = "WavTokenizer-Large-75-F16.gguf"; + [](common_params & cur) { + cur.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF"; + cur.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf"; + cur.vocoder.hf_repo = "ggml-org/WavTokenizer"; + cur.vocoder.hf_file = "WavTokenizer-Large-75-F16.gguf"; } ).set_examples({LLAMA_EXAMPLE_TTS})); diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index dadc18c8b..5bf67ecc1 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -579,8 +579,8 @@ private: seq.back().second = false; } else { std::string literal; - auto is_non_literal = [&](char c) { - return 
NON_LITERAL_SET.find(c) != NON_LITERAL_SET.end(); + auto is_non_literal = [&](char ch) { + return NON_LITERAL_SET.find(ch) != NON_LITERAL_SET.end(); }; while (i < length) { if (sub_pattern[i] == '\\' && i < length - 1) { diff --git a/common/log.cpp b/common/log.cpp index 7a94bf7f9..76715d629 100644 --- a/common/log.cpp +++ b/common/log.cpp @@ -255,8 +255,8 @@ public: thrd = std::thread([this]() { while (true) { { - std::unique_lock lock(mtx); - cv.wait(lock, [this]() { return head != tail; }); + std::unique_lock lock_thrd(mtx); + cv.wait(lock_thrd, [this]() { return head != tail; }); cur = entries[head]; diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 0659ab6f1..b17d6bc57 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -62,7 +62,7 @@ int main(int argc, char ** argv) { llama_batch batch = llama_batch_init(n_kv_max, 0, 1); // decode in batches of ctx_params.n_batch tokens - auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) { + auto decode_helper = [&ctx, &batch](int32_t n_batch) { for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) { const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); @@ -94,7 +94,7 @@ int main(int argc, char ** argv) { common_batch_add(batch, 0, i, { 0 }, false); } - if (!decode_helper(ctx, batch, ctx_params.n_batch)) { + if (!decode_helper(ctx_params.n_batch)) { LOG_ERR("%s: llama_decode() failed\n", __func__); return 1; } @@ -134,7 +134,7 @@ int main(int argc, char ** argv) { llama_kv_cache_clear(ctx); - if (!decode_helper(ctx, batch, ctx_params.n_batch)) { + if (!decode_helper(ctx_params.n_batch)) { LOG_ERR("%s: llama_decode() failed\n", __func__); return 1; } @@ -156,7 +156,7 @@ int main(int argc, char ** argv) { common_batch_add(batch, 0, pp + i, { j }, true); } - if (!decode_helper(ctx, batch, ctx_params.n_batch)) { + if (!decode_helper(ctx_params.n_batch)) { LOG_ERR("%s: llama_decode() failed\n", __func__); return 1; } diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index dc827e814..2e8812f03 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -2082,7 +2082,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli } else if (ctx->has_qwen2vl_merger) { clip_image_u8 * resized = clip_image_u8_init(); - auto patch_size = clip_patch_size(ctx) * 2; + auto patch_size = clip_get_patch_size(ctx) * 2; int nx = ceil((float)img->nx / patch_size) * patch_size; int ny = ceil((float)img->ny / patch_size) * patch_size; bicubic_resize(*img, *resized, nx, ny); @@ -2293,15 +2293,15 @@ size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float); } -int32_t clip_image_size(const struct clip_ctx * ctx) { +int32_t clip_get_image_size(const struct clip_ctx * ctx) { return ctx->vision_model.hparams.image_size; } -int32_t clip_patch_size(const struct clip_ctx * ctx) { +int32_t clip_get_patch_size(const struct clip_ctx * ctx) { return ctx->vision_model.hparams.patch_size; } -int32_t clip_hidden_size(const struct clip_ctx * ctx) { +int32_t clip_get_hidden_size(const struct clip_ctx * ctx) { return ctx->vision_model.hparams.hidden_size; } diff --git a/examples/llava/clip.h b/examples/llava/clip.h index 1603edd26..3b60f161d 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -47,9 +47,9 @@ CLIP_API void clip_free(struct clip_ctx * ctx); 
CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx); CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w); -CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx); -CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx); -CLIP_API int32_t clip_hidden_size(const struct clip_ctx * ctx); +CLIP_API int32_t clip_get_image_size (const struct clip_ctx * ctx); +CLIP_API int32_t clip_get_patch_size (const struct clip_ctx * ctx); +CLIP_API int32_t clip_get_hidden_size(const struct clip_ctx * ctx); // TODO: should be enum, not string CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx); diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index c598caf3d..1978ce180 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -105,8 +105,8 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector struct ggml_context * ctx; } model; - const int32_t image_size = clip_image_size(ctx_clip); - const int32_t patch_size = clip_patch_size(ctx_clip); + const int32_t image_size = clip_get_image_size(ctx_clip); + const int32_t patch_size = clip_get_patch_size(ctx_clip); int32_t num_patches_per_side = image_size / patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches) @@ -353,7 +353,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli img_res_v.size = 0; img_res_v.data = nullptr; - const int32_t image_size = clip_image_size(ctx_clip); + const int32_t image_size = clip_get_image_size(ctx_clip); struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 0c0f066ca..ab8d6c6b4 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -3702,8 +3702,8 @@ int main(int argc, char ** argv) { ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result_ptr & result) -> bool { json res_json = result->to_json(); if (res_json.is_array()) { - for (const auto & res : res_json) { - if (!server_sent_event(sink, "data", res)) { + for (const auto & item : res_json) { + if (!server_sent_event(sink, "data", item)) { return false; } } @@ -3973,9 +3973,9 @@ int main(int argc, char ** argv) { std::unordered_set task_ids = server_task::get_list_id(tasks); ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { - for (auto & res : results) { - GGML_ASSERT(dynamic_cast(res.get()) != nullptr); - responses.push_back(res->to_json()); + for (auto & result : results) { + GGML_ASSERT(dynamic_cast(result.get()) != nullptr); + responses.push_back(result->to_json()); } }, [&](const json & error_data) { res_error(res, error_data); @@ -4063,9 +4063,9 @@ int main(int argc, char ** argv) { std::unordered_set task_ids = server_task::get_list_id(tasks); ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { - for (auto & res : results) { - GGML_ASSERT(dynamic_cast(res.get()) != nullptr); - responses.push_back(res->to_json()); + for (auto & result : results) { + GGML_ASSERT(dynamic_cast(result.get()) != nullptr); + responses.push_back(result->to_json()); } }, [&](const json & error_data) { res_error(res, error_data); diff --git a/examples/simple-chat/simple-chat.cpp b/examples/simple-chat/simple-chat.cpp index e8eda9c22..2b2d906e5 100644 --- a/examples/simple-chat/simple-chat.cpp +++ b/examples/simple-chat/simple-chat.cpp @@ -110,9 +110,8 @@ int main(int argc, char ** argv) { 
llama_token new_token_id; while (true) { // check if we have enough space in the context to evaluate this batch - int n_ctx = llama_n_ctx(ctx); int n_ctx_used = llama_get_kv_cache_used_cells(ctx); - if (n_ctx_used + batch.n_tokens > n_ctx) { + if (n_ctx_used + batch.n_tokens > (int) llama_n_ctx(ctx)) { printf("\033[0m\n"); fprintf(stderr, "context size exceeded\n"); exit(0); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 01a3afa40..9026fbcf5 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -311,9 +311,9 @@ static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, enum llama_split_m ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type"); if (ggml_backend_split_buffer_type_fn) { size_t dev_index = [&]() { - auto * reg = ggml_backend_dev_backend_reg(dev); - for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) { - if (ggml_backend_reg_dev_get(reg, i) == dev) { + ggml_backend_reg_t reg_dev = ggml_backend_dev_backend_reg(dev); + for (size_t i = 0; i < ggml_backend_reg_dev_count(reg_dev); ++i) { + if (ggml_backend_reg_dev_get(reg_dev, i) == dev) { return i; } } @@ -1304,7 +1304,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1); auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev { if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) { - return {cpu_dev, &pimpl->cpu_buft_list}; + return { cpu_dev, &pimpl->cpu_buft_list }; } const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin(); auto * dev = devices.at(layer_gpu); @@ -1453,7 +1453,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // avoid using a host buffer when using mmap auto * buft_dev = ggml_backend_buft_get_device(buft); if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) { - auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); buft = ggml_backend_dev_buffer_type(cpu_dev); } @@ -3697,8 +3696,8 @@ ggml_backend_buffer_type_t llama_model::select_buft(int il) const { const struct ggml_tensor * llama_model::get_tensor(const char * name) const { auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(), - [name](const std::pair & it) { - return it.first == name; + [name](const std::pair & entry) { + return entry.first == name; }); if (it == tensors_by_name.end()) { return nullptr; diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 75899d142..c1e751e70 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -130,17 +130,17 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2; }; const int n_expert = std::max(1, (int)qs.model.hparams.n_expert); - auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) { + auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name_layer) { if (n_expert > 1) { // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work // for getting the current layer as I initially thought, and we need to resort to parsing the // tensor name. 
- if (sscanf(name, "blk.%d.", &i_layer) != 1) { - throw std::runtime_error(format("Failed to determine layer for tensor %s", name)); + if (sscanf(name_layer, "blk.%d.", &i_layer) != 1) { + throw std::runtime_error(format("Failed to determine layer for tensor %s", name_layer)); } if (i_layer < 0 || i_layer >= n_layer) { - throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name, n_layer)); + throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name_layer, n_layer)); } } return std::make_pair(i_layer, n_layer); diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index ef108b991..b03f40485 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -2496,15 +2496,15 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t // copy piece chars to output text buffer // skip up to 'lstrip' leading spaces before copying - auto _try_copy = [=] (const char * token, size_t size) -> int32_t { - for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) { - token++; + auto _try_copy = [=] (const char * text, size_t size) -> int32_t { + for (int32_t i = 0; i < lstrip && size && *text == ' '; ++i) { + text++; size--; } if (length < (int32_t)size) { return -(int32_t) size; } - memcpy(buf, token, size); + memcpy(buf, text, size); return (int32_t) size; };
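
The bulk of this patch renames lambda parameters in common/arg.cpp from `params` to `cur` so they no longer shadow the `params` argument of the enclosing `common_params_parser_init`. A minimal sketch of the warning involved, using hypothetical stand-in names rather than the real `common_params`/`common_arg` types, compiled with gcc and `-Wshadow`:

```cpp
// shadow_sketch.cpp -- illustrative only; params_sketch and the handlers are
// hypothetical stand-ins, not the real common_params/common_arg types.
// Compile with: g++ -std=c++17 -Wshadow -c shadow_sketch.cpp
struct params_sketch {
    int n_threads = 0;
};

void parser_init_sketch(params_sketch & params) {
    // Before: the lambda parameter re-uses the enclosing parameter's name, so
    // gcc reports something like
    //   warning: declaration of 'params' shadows a parameter [-Wshadow]
    auto handler_before = [](params_sketch & params, int value) {
        params.n_threads = value;
    };

    // After: renaming the lambda parameter (to 'cur', as in this patch) keeps
    // the behaviour identical and silences the warning.
    auto handler_after = [](params_sketch & cur, int value) {
        cur.n_threads = value;
    };

    handler_before(params, 4);
    handler_after(params, 4);
}
```

The rename changes nothing at runtime: each handler still writes into the same object that the argument parser passes in.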
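examples/batched-bench/batched-bench.cpp takes a different route: instead of renaming, `decode_helper` now captures `ctx` and `batch` by reference and keeps only `n_batch` as a parameter, so the call sites shrink from `decode_helper(ctx, batch, ctx_params.n_batch)` to `decode_helper(ctx_params.n_batch)`. A rough stand-alone sketch of that capture pattern, with stand-in types in place of `llama_context`/`llama_batch`:

```cpp
// capture_sketch.cpp -- illustrative only; ctx_sketch/batch_sketch are
// hypothetical stand-ins for llama_context/llama_batch.
#include <algorithm>
#include <cstdint>
#include <cstdio>

struct ctx_sketch   { int id = 0; };
struct batch_sketch { int32_t n_tokens = 0; };

int main() {
    ctx_sketch   ctx;
    batch_sketch batch;
    batch.n_tokens = 512;

    // Before: auto decode_helper = [](ctx_sketch * ctx, batch_sketch & batch, int32_t n_batch) { ... };
    // (the parameters shadowed the outer ctx/batch). After: capture both by
    // reference and pass only the chunk size, as batched-bench now does.
    auto decode_helper = [&ctx, &batch](int32_t n_batch) {
        for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
            const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
            std::printf("ctx %d: decode tokens [%d, %d)\n", ctx.id, i, i + n_tokens);
        }
        return true;
    };

    return decode_helper(128) ? 0 : 1;
}
```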
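The clip.h getters are also renamed (`clip_image_size`/`clip_patch_size`/`clip_hidden_size` become `clip_get_image_size`/`clip_get_patch_size`/`clip_get_hidden_size`), with llava.cpp updated to match. Any out-of-tree caller would need the same adjustment; an illustrative, hypothetical helper assuming the examples/llava headers from this tree:

```cpp
// Illustrative only: a caller-side view of the clip.h rename; mirrors how
// llava.cpp derives the number of patches per side from the renamed getters.
#include "clip.h"

static int n_patches_per_side_sketch(const clip_ctx * ctx_clip) {
    // before this patch: clip_image_size(ctx_clip) / clip_patch_size(ctx_clip)
    return clip_get_image_size(ctx_clip) / clip_get_patch_size(ctx_clip);
}
```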