mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-01 06:14:35 +00:00
common : refactor args
ggml-ci
This commit is contained in:
parent
c8880e786c
commit
7f9cc2058c
445
common/arg.cpp
445
common/arg.cpp
@ -233,10 +233,11 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
postprocess_cpu_params(params.cpuparams, nullptr);
|
postprocess_cpu_params(params.cpuparams, nullptr);
|
||||||
postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);
|
postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);
|
||||||
postprocess_cpu_params(params.draft_cpuparams, &params.cpuparams);
|
|
||||||
postprocess_cpu_params(params.draft_cpuparams_batch, &params.cpuparams_batch);
|
postprocess_cpu_params(params.speculative.cpuparams, &params.cpuparams);
|
||||||
|
postprocess_cpu_params(params.speculative.cpuparams_batch, &params.cpuparams_batch);
|
||||||
|
|
||||||
if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
|
if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
|
||||||
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
|
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
|
||||||
@ -251,7 +252,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
|||||||
for (auto & antiprompt : params.antiprompt) {
|
for (auto & antiprompt : params.antiprompt) {
|
||||||
string_process_escapes(antiprompt);
|
string_process_escapes(antiprompt);
|
||||||
}
|
}
|
||||||
for (auto & seq_breaker : params.sparams.dry_sequence_breakers) {
|
for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
|
||||||
string_process_escapes(seq_breaker);
|
string_process_escapes(seq_breaker);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -329,7 +330,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||||||
|
|
||||||
std::string sampler_type_chars;
|
std::string sampler_type_chars;
|
||||||
std::string sampler_type_names;
|
std::string sampler_type_names;
|
||||||
for (const auto & sampler : params.sparams.samplers) {
|
for (const auto & sampler : params.sampling.samplers) {
|
||||||
sampler_type_chars += common_sampler_type_to_chr(sampler);
|
sampler_type_chars += common_sampler_type_to_chr(sampler);
|
||||||
sampler_type_names += common_sampler_type_to_str(sampler) + ";";
|
sampler_type_names += common_sampler_type_to_str(sampler) + ";";
|
||||||
}
|
}
|
||||||
@ -407,26 +408,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
));
|
));
|
||||||
add_opt(common_arg(
|
|
||||||
{"-td", "--threads-draft"}, "N",
|
|
||||||
"number of threads to use during generation (default: same as --threads)",
|
|
||||||
[](common_params & params, int value) {
|
|
||||||
params.draft_cpuparams.n_threads = value;
|
|
||||||
if (params.draft_cpuparams.n_threads <= 0) {
|
|
||||||
params.draft_cpuparams.n_threads = std::thread::hardware_concurrency();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
||||||
add_opt(common_arg(
|
|
||||||
{"-tbd", "--threads-batch-draft"}, "N",
|
|
||||||
"number of threads to use during batch and prompt processing (default: same as --threads-draft)",
|
|
||||||
[](common_params & params, int value) {
|
|
||||||
params.draft_cpuparams_batch.n_threads = value;
|
|
||||||
if (params.draft_cpuparams_batch.n_threads <= 0) {
|
|
||||||
params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-C", "--cpu-mask"}, "M",
|
{"-C", "--cpu-mask"}, "M",
|
||||||
"CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")",
|
"CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")",
|
||||||
@ -515,115 +496,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||||||
params.cpuparams_batch.poll = value;
|
params.cpuparams_batch.poll = value;
|
||||||
}
|
}
|
||||||
));
|
));
|
||||||
add_opt(common_arg(
|
|
||||||
{"-Cd", "--cpu-mask-draft"}, "M",
|
|
||||||
"Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
|
|
||||||
[](common_params & params, const std::string & mask) {
|
|
||||||
params.draft_cpuparams.mask_valid = true;
|
|
||||||
if (!parse_cpu_mask(mask, params.draft_cpuparams.cpumask)) {
|
|
||||||
throw std::invalid_argument("invalid cpumask");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
||||||
add_opt(common_arg(
|
|
||||||
{"-Crd", "--cpu-range-draft"}, "lo-hi",
|
|
||||||
"Ranges of CPUs for affinity. Complements --cpu-mask-draft",
|
|
||||||
[](common_params & params, const std::string & range) {
|
|
||||||
params.draft_cpuparams.mask_valid = true;
|
|
||||||
if (!parse_cpu_range(range, params.draft_cpuparams.cpumask)) {
|
|
||||||
throw std::invalid_argument("invalid range");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
||||||
add_opt(common_arg(
|
|
||||||
{"--cpu-strict-draft"}, "<0|1>",
|
|
||||||
"Use strict CPU placement for draft model (default: same as --cpu-strict)",
|
|
||||||
[](common_params & params, int value) {
|
|
||||||
params.draft_cpuparams.strict_cpu = value;
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
||||||
add_opt(common_arg(
|
|
||||||
{"--prio-draft"}, "N",
|
|
||||||
string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority),
|
|
||||||
[](common_params & params, int prio) {
|
|
||||||
if (prio < 0 || prio > 3) {
|
|
||||||
throw std::invalid_argument("invalid value");
|
|
||||||
}
|
|
||||||
params.draft_cpuparams.priority = (enum ggml_sched_priority) prio;
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
||||||
add_opt(common_arg(
|
|
||||||
{"--poll-draft"}, "<0|1>",
|
|
||||||
"Use polling to wait for draft model work (default: same as --poll])",
|
|
||||||
[](common_params & params, int value) {
|
|
||||||
params.draft_cpuparams.poll = value;
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
||||||
add_opt(common_arg(
|
|
||||||
{"-Cbd", "--cpu-mask-batch-draft"}, "M",
|
|
||||||
"Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
|
|
||||||
[](common_params & params, const std::string & mask) {
|
|
||||||
params.draft_cpuparams_batch.mask_valid = true;
|
|
||||||
if (!parse_cpu_mask(mask, params.draft_cpuparams_batch.cpumask)) {
|
|
||||||
throw std::invalid_argument("invalid cpumask");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
||||||
add_opt(common_arg(
|
|
||||||
{"-Crbd", "--cpu-range-batch-draft"}, "lo-hi",
|
|
||||||
"Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)",
|
|
||||||
[](common_params & params, const std::string & range) {
|
|
||||||
params.draft_cpuparams_batch.mask_valid = true;
|
|
||||||
if (!parse_cpu_range(range, params.draft_cpuparams_batch.cpumask)) {
|
|
||||||
throw std::invalid_argument("invalid cpumask");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
||||||
add_opt(common_arg(
|
|
||||||
{"--cpu-strict-batch-draft"}, "<0|1>",
|
|
||||||
"Use strict CPU placement for draft model (default: --cpu-strict-draft)",
|
|
||||||
[](common_params & params, int value) {
|
|
||||||
params.draft_cpuparams_batch.strict_cpu = value;
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
||||||
add_opt(common_arg(
|
|
||||||
{"--prio-batch-draft"}, "N",
|
|
||||||
string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority),
|
|
||||||
[](common_params & params, int prio) {
|
|
||||||
if (prio < 0 || prio > 3) {
|
|
||||||
throw std::invalid_argument("invalid value");
|
|
||||||
}
|
|
||||||
params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) prio;
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
||||||
add_opt(common_arg(
|
|
||||||
{"--poll-batch-draft"}, "<0|1>",
|
|
||||||
"Use polling to wait for draft model work (default: --poll-draft)",
|
|
||||||
[](common_params & params, int value) {
|
|
||||||
params.draft_cpuparams_batch.poll = value;
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
||||||
add_opt(common_arg(
|
|
||||||
{"--draft"}, "N",
|
|
||||||
string_format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft),
|
|
||||||
[](common_params & params, int value) {
|
|
||||||
params.n_draft = value;
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
|
|
||||||
add_opt(common_arg(
|
|
||||||
{"--draft-min"}, "N",
|
|
||||||
string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.n_draft_min),
|
|
||||||
[](common_params & params, int value) {
|
|
||||||
params.n_draft_min = value;
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
|
|
||||||
add_opt(common_arg(
|
|
||||||
{"-ps", "--p-split"}, "N",
|
|
||||||
string_format("speculative decoding split probability (default: %.1f)", (double)params.p_split),
|
|
||||||
[](common_params & params, const std::string & value) {
|
|
||||||
params.p_split = std::stof(value);
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-lcs", "--lookup-cache-static"}, "FNAME",
|
{"-lcs", "--lookup-cache-static"}, "FNAME",
|
||||||
"path to static lookup cache to use for lookup decoding (not updated by generation)",
|
"path to static lookup cache to use for lookup decoding (not updated by generation)",
|
||||||
@ -708,7 +580,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||||||
string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
|
string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
|
||||||
[](common_params & params) {
|
[](common_params & params) {
|
||||||
params.no_perf = true;
|
params.no_perf = true;
|
||||||
params.sparams.no_perf = true;
|
params.sampling.no_perf = true;
|
||||||
}
|
}
|
||||||
).set_env("LLAMA_ARG_NO_PERF"));
|
).set_env("LLAMA_ARG_NO_PERF"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
@ -890,155 +762,155 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||||||
string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
|
string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
const auto sampler_names = string_split<std::string>(value, ';');
|
const auto sampler_names = string_split<std::string>(value, ';');
|
||||||
params.sparams.samplers = common_sampler_types_from_names(sampler_names, true);
|
params.sampling.samplers = common_sampler_types_from_names(sampler_names, true);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-s", "--seed"}, "SEED",
|
{"-s", "--seed"}, "SEED",
|
||||||
string_format("RNG seed (default: %d, use random seed for %d)", params.sparams.seed, LLAMA_DEFAULT_SEED),
|
string_format("RNG seed (default: %d, use random seed for %d)", params.sampling.seed, LLAMA_DEFAULT_SEED),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.seed = std::stoul(value);
|
params.sampling.seed = std::stoul(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--sampling-seq"}, "SEQUENCE",
|
{"--sampling-seq"}, "SEQUENCE",
|
||||||
string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
|
string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.samplers = common_sampler_types_from_chars(value);
|
params.sampling.samplers = common_sampler_types_from_chars(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--ignore-eos"},
|
{"--ignore-eos"},
|
||||||
"ignore end of stream token and continue generating (implies --logit-bias EOS-inf)",
|
"ignore end of stream token and continue generating (implies --logit-bias EOS-inf)",
|
||||||
[](common_params & params) {
|
[](common_params & params) {
|
||||||
params.sparams.ignore_eos = true;
|
params.sampling.ignore_eos = true;
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--penalize-nl"},
|
{"--penalize-nl"},
|
||||||
string_format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"),
|
string_format("penalize newline tokens (default: %s)", params.sampling.penalize_nl ? "true" : "false"),
|
||||||
[](common_params & params) {
|
[](common_params & params) {
|
||||||
params.sparams.penalize_nl = true;
|
params.sampling.penalize_nl = true;
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--temp"}, "N",
|
{"--temp"}, "N",
|
||||||
string_format("temperature (default: %.1f)", (double)params.sparams.temp),
|
string_format("temperature (default: %.1f)", (double)params.sampling.temp),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.temp = std::stof(value);
|
params.sampling.temp = std::stof(value);
|
||||||
params.sparams.temp = std::max(params.sparams.temp, 0.0f);
|
params.sampling.temp = std::max(params.sampling.temp, 0.0f);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--top-k"}, "N",
|
{"--top-k"}, "N",
|
||||||
string_format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k),
|
string_format("top-k sampling (default: %d, 0 = disabled)", params.sampling.top_k),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.sparams.top_k = value;
|
params.sampling.top_k = value;
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--top-p"}, "N",
|
{"--top-p"}, "N",
|
||||||
string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p),
|
string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.top_p = std::stof(value);
|
params.sampling.top_p = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--min-p"}, "N",
|
{"--min-p"}, "N",
|
||||||
string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p),
|
string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sampling.min_p),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.min_p = std::stof(value);
|
params.sampling.min_p = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--xtc-probability"}, "N",
|
{"--xtc-probability"}, "N",
|
||||||
string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_probability),
|
string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.xtc_probability = std::stof(value);
|
params.sampling.xtc_probability = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--xtc-threshold"}, "N",
|
{"--xtc-threshold"}, "N",
|
||||||
string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sparams.xtc_threshold),
|
string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sampling.xtc_threshold),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.xtc_threshold = std::stof(value);
|
params.sampling.xtc_threshold = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--typical"}, "N",
|
{"--typical"}, "N",
|
||||||
string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p),
|
string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sampling.typ_p),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.typ_p = std::stof(value);
|
params.sampling.typ_p = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--repeat-last-n"}, "N",
|
{"--repeat-last-n"}, "N",
|
||||||
string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n),
|
string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.sparams.penalty_last_n = value;
|
params.sampling.penalty_last_n = value;
|
||||||
params.sparams.n_prev = std::max(params.sparams.n_prev, params.sparams.penalty_last_n);
|
params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--repeat-penalty"}, "N",
|
{"--repeat-penalty"}, "N",
|
||||||
string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat),
|
string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sampling.penalty_repeat),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.penalty_repeat = std::stof(value);
|
params.sampling.penalty_repeat = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--presence-penalty"}, "N",
|
{"--presence-penalty"}, "N",
|
||||||
string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present),
|
string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_present),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.penalty_present = std::stof(value);
|
params.sampling.penalty_present = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--frequency-penalty"}, "N",
|
{"--frequency-penalty"}, "N",
|
||||||
string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq),
|
string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_freq),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.penalty_freq = std::stof(value);
|
params.sampling.penalty_freq = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--dry-multiplier"}, "N",
|
{"--dry-multiplier"}, "N",
|
||||||
string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sparams.dry_multiplier),
|
string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sampling.dry_multiplier),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.dry_multiplier = std::stof(value);
|
params.sampling.dry_multiplier = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--dry-base"}, "N",
|
{"--dry-base"}, "N",
|
||||||
string_format("set DRY sampling base value (default: %.2f)", (double)params.sparams.dry_base),
|
string_format("set DRY sampling base value (default: %.2f)", (double)params.sampling.dry_base),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
float potential_base = std::stof(value);
|
float potential_base = std::stof(value);
|
||||||
if (potential_base >= 1.0f)
|
if (potential_base >= 1.0f)
|
||||||
{
|
{
|
||||||
params.sparams.dry_base = potential_base;
|
params.sampling.dry_base = potential_base;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--dry-allowed-length"}, "N",
|
{"--dry-allowed-length"}, "N",
|
||||||
string_format("set allowed length for DRY sampling (default: %d)", params.sparams.dry_allowed_length),
|
string_format("set allowed length for DRY sampling (default: %d)", params.sampling.dry_allowed_length),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.sparams.dry_allowed_length = value;
|
params.sampling.dry_allowed_length = value;
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--dry-penalty-last-n"}, "N",
|
{"--dry-penalty-last-n"}, "N",
|
||||||
string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sparams.dry_penalty_last_n),
|
string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.sparams.dry_penalty_last_n = value;
|
params.sampling.dry_penalty_last_n = value;
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--dry-sequence-breaker"}, "STRING",
|
{"--dry-sequence-breaker"}, "STRING",
|
||||||
string_format("add sequence breaker for DRY sampling, clearing out default breakers (%s) in the process; use \"none\" to not use any sequence breakers\n",
|
string_format("add sequence breaker for DRY sampling, clearing out default breakers (%s) in the process; use \"none\" to not use any sequence breakers\n",
|
||||||
params.sparams.dry_sequence_breakers.empty() ? "none" :
|
params.sampling.dry_sequence_breakers.empty() ? "none" :
|
||||||
std::accumulate(std::next(params.sparams.dry_sequence_breakers.begin()),
|
std::accumulate(std::next(params.sampling.dry_sequence_breakers.begin()),
|
||||||
params.sparams.dry_sequence_breakers.end(),
|
params.sampling.dry_sequence_breakers.end(),
|
||||||
std::string("'") + (params.sparams.dry_sequence_breakers[0] == "\n" ? "\\n" : params.sparams.dry_sequence_breakers[0]) + "'",
|
std::string("'") + (params.sampling.dry_sequence_breakers[0] == "\n" ? "\\n" : params.sampling.dry_sequence_breakers[0]) + "'",
|
||||||
[](const std::string& a, const std::string& b) {
|
[](const std::string& a, const std::string& b) {
|
||||||
std::string formatted_b = (b == "\n") ? "\\n" : b;
|
std::string formatted_b = (b == "\n") ? "\\n" : b;
|
||||||
return a + ", '" + formatted_b + "'";
|
return a + ", '" + formatted_b + "'";
|
||||||
@ -1047,51 +919,51 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||||||
static bool defaults_cleared = false;
|
static bool defaults_cleared = false;
|
||||||
|
|
||||||
if (!defaults_cleared) {
|
if (!defaults_cleared) {
|
||||||
params.sparams.dry_sequence_breakers.clear();
|
params.sampling.dry_sequence_breakers.clear();
|
||||||
defaults_cleared = true;
|
defaults_cleared = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (value == "none") {
|
if (value == "none") {
|
||||||
params.sparams.dry_sequence_breakers.clear();
|
params.sampling.dry_sequence_breakers.clear();
|
||||||
} else {
|
} else {
|
||||||
params.sparams.dry_sequence_breakers.emplace_back(value);
|
params.sampling.dry_sequence_breakers.emplace_back(value);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--dynatemp-range"}, "N",
|
{"--dynatemp-range"}, "N",
|
||||||
string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range),
|
string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sampling.dynatemp_range),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.dynatemp_range = std::stof(value);
|
params.sampling.dynatemp_range = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--dynatemp-exp"}, "N",
|
{"--dynatemp-exp"}, "N",
|
||||||
string_format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent),
|
string_format("dynamic temperature exponent (default: %.1f)", (double)params.sampling.dynatemp_exponent),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.dynatemp_exponent = std::stof(value);
|
params.sampling.dynatemp_exponent = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--mirostat"}, "N",
|
{"--mirostat"}, "N",
|
||||||
string_format("use Mirostat sampling.\nTop K, Nucleus and Locally Typical samplers are ignored if used.\n"
|
string_format("use Mirostat sampling.\nTop K, Nucleus and Locally Typical samplers are ignored if used.\n"
|
||||||
"(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat),
|
"(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sampling.mirostat),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.sparams.mirostat = value;
|
params.sampling.mirostat = value;
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--mirostat-lr"}, "N",
|
{"--mirostat-lr"}, "N",
|
||||||
string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta),
|
string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sampling.mirostat_eta),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.mirostat_eta = std::stof(value);
|
params.sampling.mirostat_eta = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--mirostat-ent"}, "N",
|
{"--mirostat-ent"}, "N",
|
||||||
string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau),
|
string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sampling.mirostat_tau),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.mirostat_tau = std::stof(value);
|
params.sampling.mirostat_tau = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
@ -1107,7 +979,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||||||
try {
|
try {
|
||||||
if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
|
if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
|
||||||
const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
|
const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
|
||||||
params.sparams.logit_bias.push_back({key, bias});
|
params.sampling.logit_bias.push_back({key, bias});
|
||||||
} else {
|
} else {
|
||||||
throw std::invalid_argument("invalid input format");
|
throw std::invalid_argument("invalid input format");
|
||||||
}
|
}
|
||||||
@ -1118,9 +990,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--grammar"}, "GRAMMAR",
|
{"--grammar"}, "GRAMMAR",
|
||||||
string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()),
|
string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sampling.grammar.c_str()),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.grammar = value;
|
params.sampling.grammar = value;
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
@ -1134,7 +1006,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||||||
std::copy(
|
std::copy(
|
||||||
std::istreambuf_iterator<char>(file),
|
std::istreambuf_iterator<char>(file),
|
||||||
std::istreambuf_iterator<char>(),
|
std::istreambuf_iterator<char>(),
|
||||||
std::back_inserter(params.sparams.grammar)
|
std::back_inserter(params.sampling.grammar)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
@ -1142,7 +1014,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||||||
{"-j", "--json-schema"}, "SCHEMA",
|
{"-j", "--json-schema"}, "SCHEMA",
|
||||||
"JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
|
"JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.grammar = json_schema_to_grammar(json::parse(value));
|
params.sampling.grammar = json_schema_to_grammar(json::parse(value));
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
@ -1451,17 +1323,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
).set_env("LLAMA_ARG_N_GPU_LAYERS"));
|
).set_env("LLAMA_ARG_N_GPU_LAYERS"));
|
||||||
add_opt(common_arg(
|
|
||||||
{"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
|
|
||||||
"number of layers to store in VRAM for the draft model",
|
|
||||||
[](common_params & params, int value) {
|
|
||||||
params.n_gpu_layers_draft = value;
|
|
||||||
if (!llama_supports_gpu_offload()) {
|
|
||||||
fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
|
|
||||||
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
|
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-sm", "--split-mode"}, "{none,layer,row}",
|
{"-sm", "--split-mode"}, "{none,layer,row}",
|
||||||
"how to split the model across multiple GPUs, one of:\n"
|
"how to split the model across multiple GPUs, one of:\n"
|
||||||
@ -1600,13 +1461,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||||||
params.model = value;
|
params.model = value;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
|
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
|
||||||
add_opt(common_arg(
|
|
||||||
{"-md", "--model-draft"}, "FNAME",
|
|
||||||
"draft model for speculative decoding (default: unused)",
|
|
||||||
[](common_params & params, const std::string & value) {
|
|
||||||
params.model_draft = value;
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
|
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-mu", "--model-url"}, "MODEL_URL",
|
{"-mu", "--model-url"}, "MODEL_URL",
|
||||||
"model download url (default: unused)",
|
"model download url (default: unused)",
|
||||||
@ -2044,5 +1898,168 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||||||
}
|
}
|
||||||
).set_env("LLAMA_LOG_TIMESTAMPS"));
|
).set_env("LLAMA_LOG_TIMESTAMPS"));
|
||||||
|
|
||||||
|
// speculative parameters
|
||||||
|
// -td / --threads-draft: generation thread count for the draft model.
// A non-positive value is replaced by std::thread::hardware_concurrency().
add_opt(common_arg(
    {"-td", "--threads-draft"}, "N",
    "number of threads to use during generation (default: same as --threads)",
    [](common_params & params, int value) {
        auto & n_threads = params.speculative.cpuparams.n_threads;
        n_threads = value > 0 ? value : static_cast<int>(std::thread::hardware_concurrency());
    }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||||
|
// -tbd / --threads-batch-draft: batch/prompt-processing thread count for the draft model.
// A non-positive value is replaced by std::thread::hardware_concurrency().
add_opt(common_arg(
    {"-tbd", "--threads-batch-draft"}, "N",
    "number of threads to use during batch and prompt processing (default: same as --threads-draft)",
    [](common_params & params, int value) {
        auto & n_threads = params.speculative.cpuparams_batch.n_threads;
        n_threads = value > 0 ? value : static_cast<int>(std::thread::hardware_concurrency());
    }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||||
|
// -Cd / --cpu-mask-draft: CPU affinity mask for the draft model.
add_opt(common_arg(
    {"-Cd", "--cpu-mask-draft"}, "M",
    "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
    [](common_params & params, const std::string & mask) {
        auto & cpu = params.speculative.cpuparams;
        // the flag is raised before parsing, so it stays set even when parsing throws
        cpu.mask_valid = true;
        const bool parsed = parse_cpu_mask(mask, cpu.cpumask);
        if (!parsed) {
            throw std::invalid_argument("invalid cpumask");
        }
    }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||||
|
// -Crd / --cpu-range-draft: CPU range ("lo-hi") merged into the draft model affinity mask.
add_opt(common_arg(
    {"-Crd", "--cpu-range-draft"}, "lo-hi",
    "Ranges of CPUs for affinity. Complements --cpu-mask-draft",
    [](common_params & params, const std::string & range) {
        auto & cpu = params.speculative.cpuparams;
        // the flag is raised before parsing, so it stays set even when parsing throws
        cpu.mask_valid = true;
        const bool parsed = parse_cpu_range(range, cpu.cpumask);
        if (!parsed) {
            throw std::invalid_argument("invalid range");
        }
    }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||||
|
// --cpu-strict-draft: strict CPU placement toggle for the draft model.
add_opt(common_arg(
    {"--cpu-strict-draft"}, "<0|1>",
    "Use strict CPU placement for draft model (default: same as --cpu-strict)",
    [](common_params & params, int value) {
        auto & cpu = params.speculative.cpuparams;
        cpu.strict_cpu = value;
    }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||||
|
// --prio-draft: scheduling priority for draft-model threads; only 0..3 is accepted.
add_opt(common_arg(
    {"--prio-draft"}, "N",
    string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams.priority),
    [](common_params & params, int prio) {
        const bool in_range = prio >= 0 && prio <= 3;
        if (!in_range) {
            throw std::invalid_argument("invalid value");
        }
        params.speculative.cpuparams.priority = static_cast<ggml_sched_priority>(prio);
    }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||||
|
// --poll-draft: polling toggle used while waiting for draft-model work.
// The help text previously ended in "--poll])"; the stray ']' is removed.
add_opt(common_arg(
    {"--poll-draft"}, "<0|1>",
    "Use polling to wait for draft model work (default: same as --poll)",
    [](common_params & params, int value) {
        params.speculative.cpuparams.poll = value;
    }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||||
|
// -Cbd / --cpu-mask-batch-draft: CPU affinity mask for draft-model batch processing.
// The help text previously pointed at the non-batch option "cpu-range-draft";
// it now names the matching batch range option.
// NOTE(review): the "(default: same as --cpu-mask)" wording is kept as-is; confirm
// which parameter set the batch-draft settings actually fall back to.
add_opt(common_arg(
    {"-Cbd", "--cpu-mask-batch-draft"}, "M",
    "Draft model CPU affinity mask. Complements cpu-range-batch-draft (default: same as --cpu-mask)",
    [](common_params & params, const std::string & mask) {
        // the flag is raised before parsing, so it stays set even when parsing throws
        params.speculative.cpuparams_batch.mask_valid = true;
        if (!parse_cpu_mask(mask, params.speculative.cpuparams_batch.cpumask)) {
            throw std::invalid_argument("invalid cpumask");
        }
    }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||||
|
// -Crbd / --cpu-range-batch-draft: CPU range ("lo-hi") merged into the draft-model
// batch affinity mask.
// Fixes: the help text had a stray ')' and referenced the non-existent option
// "--cpu-mask-draft-batch" (the registered name is --cpu-mask-batch-draft); a failed
// range parse reported "invalid cpumask" instead of "invalid range", unlike
// --cpu-range-draft.
add_opt(common_arg(
    {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi",
    "Ranges of CPUs for affinity. Complements --cpu-mask-batch-draft",
    [](common_params & params, const std::string & range) {
        // the flag is raised before parsing, so it stays set even when parsing throws
        params.speculative.cpuparams_batch.mask_valid = true;
        if (!parse_cpu_range(range, params.speculative.cpuparams_batch.cpumask)) {
            throw std::invalid_argument("invalid range");
        }
    }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||||
|
// --cpu-strict-batch-draft: strict CPU placement toggle for draft-model batch processing.
add_opt(common_arg(
    {"--cpu-strict-batch-draft"}, "<0|1>",
    "Use strict CPU placement for draft model (default: --cpu-strict-draft)",
    [](common_params & params, int value) {
        auto & cpu_batch = params.speculative.cpuparams_batch;
        cpu_batch.strict_cpu = value;
    }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||||
|
// --prio-batch-draft: scheduling priority for draft-model batch threads; only 0..3 is accepted.
add_opt(common_arg(
    {"--prio-batch-draft"}, "N",
    string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams_batch.priority),
    [](common_params & params, int prio) {
        const bool in_range = prio >= 0 && prio <= 3;
        if (!in_range) {
            throw std::invalid_argument("invalid value");
        }
        params.speculative.cpuparams_batch.priority = static_cast<ggml_sched_priority>(prio);
    }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||||
|
// --poll-batch-draft: polling toggle used while waiting for draft-model batch work.
add_opt(common_arg(
    {"--poll-batch-draft"}, "<0|1>",
    "Use polling to wait for draft model work (default: --poll-draft)",
    [](common_params & params, int value) {
        auto & cpu_batch = params.speculative.cpuparams_batch;
        cpu_batch.poll = value;
    }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||||
|
// --draft-max (aliases: --draft, --draft-n): stores the value in speculative.n_max.
add_opt(common_arg(
    {"--draft-max", "--draft", "--draft-n"}, "N",
    string_format("number of tokens to draft for speculative decoding (default: %d)", params.speculative.n_max),
    [](common_params & params, int value) {
        auto & spec = params.speculative;
        spec.n_max = value;
    }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
|
||||||
|
// --draft-min (alias: --draft-n-min): stores the value in speculative.n_min.
add_opt(common_arg(
    {"--draft-min", "--draft-n-min"}, "N",
    string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
    [](common_params & params, int value) {
        auto & spec = params.speculative;
        spec.n_min = value;
    }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
|
||||||
|
// --draft-p-split: parses the argument with std::stof and stores it in speculative.p_split.
add_opt(common_arg(
    {"--draft-p-split"}, "P",
    string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
    [](common_params & params, const std::string & value) {
        const float p_split = std::stof(value);
        params.speculative.p_split = p_split;
    }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||||
|
// --draft-p-min: parses the argument with std::stof and stores it in speculative.p_min.
add_opt(common_arg(
    {"--draft-p-min"}, "P",
    string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min),
    [](common_params & params, const std::string & value) {
        const float p_min = std::stof(value);
        params.speculative.p_min = p_min;
    }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
|
||||||
|
// -cd / --ctx-size-draft: stores the value in speculative.n_ctx.
add_opt(common_arg(
    {"-cd", "--ctx-size-draft"}, "N",
    string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
    [](common_params & params, int value) {
        auto & spec = params.speculative;
        spec.n_ctx = value;
    }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
|
||||||
|
// -ngld / --gpu-layers-draft: stores the value in speculative.n_gpu_layers.
// The value is always stored; two warnings go to stderr when
// llama_supports_gpu_offload() returns false.
add_opt(common_arg(
    {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
    "number of layers to store in VRAM for the draft model",
    [](common_params & params, int value) {
        params.speculative.n_gpu_layers = value;
        if (llama_supports_gpu_offload()) {
            return;
        }
        fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
        fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
    }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
|
||||||
|
// -md / --model-draft: stores the draft model file name in speculative.model.
add_opt(common_arg(
    {"-md", "--model-draft"}, "FNAME",
    "draft model for speculative decoding (default: unused)",
    [](common_params & params, const std::string & value) {
        auto & spec = params.speculative;
        spec.model = value;
    }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
|
||||||
|
|
||||||
return ctx_arg;
|
return ctx_arg;
|
||||||
}
|
}
|
||||||
|
@ -925,9 +925,9 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|||||||
common_lora_adapters_apply(lctx, iparams.lora_adapters);
|
common_lora_adapters_apply(lctx, iparams.lora_adapters);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
|
if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
|
||||||
LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
|
LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
|
||||||
params.sparams.ignore_eos = false;
|
params.sampling.ignore_eos = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.warmup) {
|
if (params.warmup) {
|
||||||
|
@ -103,8 +103,8 @@ enum dimre_method {
|
|||||||
DIMRE_METHOD_MEAN,
|
DIMRE_METHOD_MEAN,
|
||||||
};
|
};
|
||||||
|
|
||||||
// sampler parameters
|
// sampling parameters
|
||||||
struct common_sampler_params {
|
struct common_params_sampling {
|
||||||
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
|
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
|
||||||
|
|
||||||
int32_t n_prev = 64; // number of previous tokens to remember
|
int32_t n_prev = 64; // number of previous tokens to remember
|
||||||
@ -155,20 +155,30 @@ struct common_sampler_params {
|
|||||||
std::string print() const;
|
std::string print() const;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct common_params_speculative {
|
||||||
|
int32_t n_ctx = 4096; // draft context size
|
||||||
|
int32_t n_max = 5; // maximum number of tokens to draft during speculative decoding
|
||||||
|
int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
|
||||||
|
int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
|
||||||
|
float p_split = 0.1f; // speculative decoding split probability
|
||||||
|
float p_min = 0.9f; // minimum speculative decoding probability (greedy)
|
||||||
|
|
||||||
|
struct cpu_params cpuparams;
|
||||||
|
struct cpu_params cpuparams_batch;
|
||||||
|
|
||||||
|
std::string model = ""; // draft model for speculative decoding // NOLINT
|
||||||
|
};
|
||||||
|
|
||||||
struct common_params {
|
struct common_params {
|
||||||
int32_t n_predict = -1; // new tokens to predict
|
int32_t n_predict = -1; // new tokens to predict
|
||||||
int32_t n_ctx = 4096; // context size
|
int32_t n_ctx = 4096; // context size
|
||||||
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
|
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
|
||||||
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
|
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
|
||||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||||
int32_t n_draft = 5; // number of tokens to draft during speculative decoding
|
|
||||||
int32_t n_draft_min = 0; // minimum number of draft tokens to use for speculative decoding
|
|
||||||
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
|
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
|
||||||
int32_t n_parallel = 1; // number of parallel sequences to decode
|
int32_t n_parallel = 1; // number of parallel sequences to decode
|
||||||
int32_t n_sequences = 1; // number of sequences to decode
|
int32_t n_sequences = 1; // number of sequences to decode
|
||||||
float p_split = 0.1f; // speculative decoding split probability
|
|
||||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
|
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
|
||||||
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
|
|
||||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||||
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
|
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
|
||||||
int32_t grp_attn_n = 1; // group-attention factor
|
int32_t grp_attn_n = 1; // group-attention factor
|
||||||
@ -185,8 +195,6 @@ struct common_params {
|
|||||||
|
|
||||||
struct cpu_params cpuparams;
|
struct cpu_params cpuparams;
|
||||||
struct cpu_params cpuparams_batch;
|
struct cpu_params cpuparams_batch;
|
||||||
struct cpu_params draft_cpuparams;
|
|
||||||
struct cpu_params draft_cpuparams_batch;
|
|
||||||
|
|
||||||
ggml_backend_sched_eval_callback cb_eval = nullptr;
|
ggml_backend_sched_eval_callback cb_eval = nullptr;
|
||||||
void * cb_eval_user_data = nullptr;
|
void * cb_eval_user_data = nullptr;
|
||||||
@ -198,10 +206,10 @@ struct common_params {
|
|||||||
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
|
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
|
||||||
enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
|
enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
|
||||||
|
|
||||||
struct common_sampler_params sparams;
|
struct common_params_sampling sampling;
|
||||||
|
struct common_params_speculative speculative;
|
||||||
|
|
||||||
std::string model = ""; // model path // NOLINT
|
std::string model = ""; // model path // NOLINT
|
||||||
std::string model_draft = ""; // draft model for speculative decoding // NOLINT
|
|
||||||
std::string model_alias = "unknown"; // model alias // NOLINT
|
std::string model_alias = "unknown"; // model alias // NOLINT
|
||||||
std::string model_url = ""; // model url to download // NOLINT
|
std::string model_url = ""; // model url to download // NOLINT
|
||||||
std::string hf_token = ""; // HF token // NOLINT
|
std::string hf_token = ""; // HF token // NOLINT
|
||||||
|
@ -99,7 +99,7 @@ struct ring_buffer {
|
|||||||
};
|
};
|
||||||
|
|
||||||
struct common_sampler {
|
struct common_sampler {
|
||||||
common_sampler_params params;
|
common_params_sampling params;
|
||||||
|
|
||||||
struct llama_sampler * grmr;
|
struct llama_sampler * grmr;
|
||||||
struct llama_sampler * chain;
|
struct llama_sampler * chain;
|
||||||
@ -125,7 +125,7 @@ struct common_sampler {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
std::string common_sampler_params::print() const {
|
std::string common_params_sampling::print() const {
|
||||||
char result[1024];
|
char result[1024];
|
||||||
|
|
||||||
snprintf(result, sizeof(result),
|
snprintf(result, sizeof(result),
|
||||||
@ -141,7 +141,7 @@ std::string common_sampler_params::print() const {
|
|||||||
return std::string(result);
|
return std::string(result);
|
||||||
}
|
}
|
||||||
|
|
||||||
struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params) {
|
struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
|
||||||
llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
|
llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
|
||||||
|
|
||||||
lparams.no_perf = params.no_perf;
|
lparams.no_perf = params.no_perf;
|
||||||
|
@ -36,7 +36,7 @@ struct common_sampler;
|
|||||||
|
|
||||||
// llama_sampler API overloads
|
// llama_sampler API overloads
|
||||||
|
|
||||||
struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params);
|
struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params);
|
||||||
|
|
||||||
void common_sampler_free(struct common_sampler * gsmpl);
|
void common_sampler_free(struct common_sampler * gsmpl);
|
||||||
|
|
||||||
|
@ -29,32 +29,32 @@ struct common_speculative * common_speculative_init(
|
|||||||
// TODO: optimize or pass from outside?
|
// TODO: optimize or pass from outside?
|
||||||
#if 0
|
#if 0
|
||||||
{
|
{
|
||||||
common_sampler_params sparams;
|
common_params_sampling params;
|
||||||
sparams.no_perf = false;
|
params.no_perf = false;
|
||||||
|
|
||||||
sparams.top_k = 40;
|
params.top_k = 40;
|
||||||
sparams.top_p = 0.9;
|
params.top_p = 0.9;
|
||||||
|
|
||||||
sparams.samplers = {
|
params.samplers = {
|
||||||
COMMON_SAMPLER_TYPE_TOP_K,
|
COMMON_SAMPLER_TYPE_TOP_K,
|
||||||
COMMON_SAMPLER_TYPE_TOP_P,
|
COMMON_SAMPLER_TYPE_TOP_P,
|
||||||
COMMON_SAMPLER_TYPE_INFILL,
|
COMMON_SAMPLER_TYPE_INFILL,
|
||||||
};
|
};
|
||||||
|
|
||||||
result->smpl = common_sampler_init(llama_get_model(ctx_dft), sparams);
|
result->smpl = common_sampler_init(llama_get_model(ctx_dft), params);
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
{
|
{
|
||||||
common_sampler_params sparams;
|
common_params_sampling params;
|
||||||
sparams.no_perf = false;
|
params.no_perf = false;
|
||||||
|
|
||||||
sparams.top_k = 10;
|
params.top_k = 10;
|
||||||
|
|
||||||
sparams.samplers = {
|
params.samplers = {
|
||||||
COMMON_SAMPLER_TYPE_TOP_K,
|
COMMON_SAMPLER_TYPE_TOP_K,
|
||||||
};
|
};
|
||||||
|
|
||||||
result->smpl = common_sampler_init(llama_get_model(ctx_dft), sparams);
|
result->smpl = common_sampler_init(llama_get_model(ctx_dft), params);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -68,10 +68,10 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
llama_sampler * smpl = llama_sampler_chain_init(sparams);
|
llama_sampler * smpl = llama_sampler_chain_init(sparams);
|
||||||
|
|
||||||
llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sparams.top_k));
|
llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sampling.top_k));
|
||||||
llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sparams.top_p, params.sparams.min_keep));
|
llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sampling.top_p, params.sampling.min_keep));
|
||||||
llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sparams.temp));
|
llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sampling.temp));
|
||||||
llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed));
|
llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sampling.seed));
|
||||||
|
|
||||||
if (ctx == NULL) {
|
if (ctx == NULL) {
|
||||||
LOG_ERR("%s: error: failed to create the llama_context\n" , __func__);
|
LOG_ERR("%s: error: failed to create the llama_context\n" , __func__);
|
||||||
|
@ -73,7 +73,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
common_init();
|
common_init();
|
||||||
|
|
||||||
auto & sparams = params.sparams;
|
auto & sparams = params.sampling;
|
||||||
|
|
||||||
console::init(params.simple_io, params.use_color);
|
console::init(params.simple_io, params.use_color);
|
||||||
atexit([]() { console::cleanup(); });
|
atexit([]() { console::cleanup(); });
|
||||||
|
@ -191,7 +191,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
|||||||
|
|
||||||
LOG("\n");
|
LOG("\n");
|
||||||
|
|
||||||
struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sparams);
|
struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling);
|
||||||
if (!smpl) {
|
if (!smpl) {
|
||||||
LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
|
LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
|
||||||
exit(1);
|
exit(1);
|
||||||
|
@ -237,7 +237,7 @@ static struct common_sampler * llama_init(struct llava_context * ctx_llava, comm
|
|||||||
|
|
||||||
LOG_INF("\n");
|
LOG_INF("\n");
|
||||||
|
|
||||||
struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sparams);
|
struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling);
|
||||||
return smpl;
|
return smpl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -115,7 +115,7 @@ int main(int argc, char ** argv) {
|
|||||||
llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
|
llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
|
||||||
|
|
||||||
// target model sampling context
|
// target model sampling context
|
||||||
struct common_sampler * smpl = common_sampler_init(model, params.sparams);
|
struct common_sampler * smpl = common_sampler_init(model, params.sampling);
|
||||||
|
|
||||||
// verification n-grams
|
// verification n-grams
|
||||||
std::vector<ngram_data> ngrams_cur(G);
|
std::vector<ngram_data> ngrams_cur(G);
|
||||||
|
@ -21,7 +21,7 @@ int main(int argc, char ** argv){
|
|||||||
|
|
||||||
common_init();
|
common_init();
|
||||||
|
|
||||||
const int n_draft = params.n_draft;
|
const int n_draft = params.speculative.n_max;
|
||||||
|
|
||||||
// init llama.cpp
|
// init llama.cpp
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
@ -40,6 +40,7 @@ int main(int argc, char ** argv){
|
|||||||
common_ngram_cache ngram_cache_context;
|
common_ngram_cache ngram_cache_context;
|
||||||
common_ngram_cache ngram_cache_dynamic;
|
common_ngram_cache ngram_cache_dynamic;
|
||||||
common_ngram_cache ngram_cache_static;
|
common_ngram_cache ngram_cache_static;
|
||||||
|
|
||||||
int64_t t_draft_flat_us = 0;
|
int64_t t_draft_flat_us = 0;
|
||||||
int64_t t_draft_us = 0;
|
int64_t t_draft_us = 0;
|
||||||
|
|
||||||
|
@ -22,7 +22,7 @@ int main(int argc, char ** argv){
|
|||||||
common_init();
|
common_init();
|
||||||
|
|
||||||
// max. number of additional tokens to draft if match is found
|
// max. number of additional tokens to draft if match is found
|
||||||
const int n_draft = params.n_draft;
|
const int n_draft = params.speculative.n_max;
|
||||||
|
|
||||||
const bool dump_kv_cache = params.dump_kv_cache;
|
const bool dump_kv_cache = params.dump_kv_cache;
|
||||||
|
|
||||||
@ -102,7 +102,7 @@ int main(int argc, char ** argv){
|
|||||||
|
|
||||||
bool has_eos = false;
|
bool has_eos = false;
|
||||||
|
|
||||||
struct common_sampler * smpl = common_sampler_init(model, params.sparams);
|
struct common_sampler * smpl = common_sampler_init(model, params.sampling);
|
||||||
|
|
||||||
std::vector<llama_token> draft;
|
std::vector<llama_token> draft;
|
||||||
|
|
||||||
|
@ -100,7 +100,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
common_init();
|
common_init();
|
||||||
|
|
||||||
auto & sparams = params.sparams;
|
auto & sparams = params.sampling;
|
||||||
|
|
||||||
// save choice to use color for later
|
// save choice to use color for later
|
||||||
// (note for later: this is a slightly awkward choice)
|
// (note for later: this is a slightly awkward choice)
|
||||||
|
@ -160,7 +160,7 @@ int main(int argc, char ** argv) {
|
|||||||
for (size_t i = 0; i < clients.size(); ++i) {
|
for (size_t i = 0; i < clients.size(); ++i) {
|
||||||
auto & client = clients[i];
|
auto & client = clients[i];
|
||||||
client.id = i;
|
client.id = i;
|
||||||
client.smpl = common_sampler_init(model, params.sparams);
|
client.smpl = common_sampler_init(model, params.sampling);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<llama_token> tokens_system;
|
std::vector<llama_token> tokens_system;
|
||||||
|
@ -282,8 +282,8 @@ int main(int argc, char ** argv) {
|
|||||||
return a.second > b.second;
|
return a.second > b.second;
|
||||||
});
|
});
|
||||||
|
|
||||||
LOG("Top %d similar chunks:\n", params.sparams.top_k);
|
LOG("Top %d similar chunks:\n", params.sampling.top_k);
|
||||||
for (int i = 0; i < std::min(params.sparams.top_k, (int) chunks.size()); i++) {
|
for (int i = 0; i < std::min(params.sampling.top_k, (int) chunks.size()); i++) {
|
||||||
LOG("filename: %s\n", chunks[similarities[i].first].filename.c_str());
|
LOG("filename: %s\n", chunks[similarities[i].first].filename.c_str());
|
||||||
LOG("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
|
LOG("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
|
||||||
LOG("similarity: %f\n", similarities[i].second);
|
LOG("similarity: %f\n", similarities[i].second);
|
||||||
|
@ -9,7 +9,7 @@ int main(int argc, char ** argv) {
|
|||||||
common_params params;
|
common_params params;
|
||||||
|
|
||||||
params.prompt = "The quick brown fox";
|
params.prompt = "The quick brown fox";
|
||||||
params.sparams.seed = 1234;
|
params.sampling.seed = 1234;
|
||||||
|
|
||||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
|
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
|
||||||
return 1;
|
return 1;
|
||||||
@ -42,7 +42,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
llama_sampler * smpl = llama_sampler_chain_init(sparams);
|
llama_sampler * smpl = llama_sampler_chain_init(sparams);
|
||||||
|
|
||||||
llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sparams.seed));
|
llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sampling.seed));
|
||||||
|
|
||||||
// tokenize prompt
|
// tokenize prompt
|
||||||
auto tokens = common_tokenize(ctx, params.prompt, true);
|
auto tokens = common_tokenize(ctx, params.prompt, true);
|
||||||
@ -106,7 +106,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
llama_sampler * smpl2 = llama_sampler_chain_init(sparams);
|
llama_sampler * smpl2 = llama_sampler_chain_init(sparams);
|
||||||
|
|
||||||
llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sparams.seed));
|
llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sampling.seed));
|
||||||
|
|
||||||
printf("\nsecond run: %s", params.prompt.c_str());
|
printf("\nsecond run: %s", params.prompt.c_str());
|
||||||
|
|
||||||
@ -169,7 +169,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
llama_sampler * smpl3 = llama_sampler_chain_init(sparams);
|
llama_sampler * smpl3 = llama_sampler_chain_init(sparams);
|
||||||
|
|
||||||
llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sparams.seed));
|
llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sampling.seed));
|
||||||
|
|
||||||
printf("\nsingle seq run: %s", params.prompt.c_str());
|
printf("\nsingle seq run: %s", params.prompt.c_str());
|
||||||
|
|
||||||
|
@ -175,7 +175,7 @@ struct server_slot {
|
|||||||
// sampling
|
// sampling
|
||||||
json json_schema;
|
json json_schema;
|
||||||
|
|
||||||
struct common_sampler_params sparams;
|
struct common_params_sampling sparams;
|
||||||
struct common_sampler * smpl = nullptr;
|
struct common_sampler * smpl = nullptr;
|
||||||
|
|
||||||
llama_token sampled;
|
llama_token sampled;
|
||||||
@ -687,7 +687,7 @@ struct server_context {
|
|||||||
|
|
||||||
SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);
|
SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);
|
||||||
|
|
||||||
slot.sparams = params.sparams;
|
slot.sparams = params.sampling;
|
||||||
|
|
||||||
slot.callback_on_release = [this](int) {
|
slot.callback_on_release = [this](int) {
|
||||||
queue_tasks.pop_deferred_task();
|
queue_tasks.pop_deferred_task();
|
||||||
@ -788,7 +788,7 @@ struct server_context {
|
|||||||
bool launch_slot_with_task(server_slot & slot, const server_task & task) {
|
bool launch_slot_with_task(server_slot & slot, const server_task & task) {
|
||||||
slot_params default_params;
|
slot_params default_params;
|
||||||
// Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
|
// Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
|
||||||
auto default_sparams = params.sparams;
|
auto default_sparams = params.sampling;
|
||||||
const auto & data = task.data;
|
const auto & data = task.data;
|
||||||
|
|
||||||
if (data.count("__oaicompat") != 0) {
|
if (data.count("__oaicompat") != 0) {
|
||||||
|
@ -1,3 +1,12 @@
|
|||||||
# llama.cpp/examples/speculative-simple
|
# llama.cpp/examples/speculative-simple
|
||||||
|
|
||||||
Demonstration of basic greedy speculative decoding
|
Demonstration of basic greedy speculative decoding
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./bin/llama-speculative-simple \
|
||||||
|
-m ../models/qwen2.5-32b-coder-instruct/ggml-model-q8_0.gguf \
|
||||||
|
-md ../models/qwen2.5-1.5b-coder-instruct/ggml-model-q4_0.gguf \
|
||||||
|
-f test.txt -c 0 -ngl 99 --color \
|
||||||
|
--sampling-seq k --top-k 1 -fa --temp 0.0 \
|
||||||
|
-ngld 99 --draft-max 16 --draft-min 5 --draft-p-min 0.9
|
||||||
|
```
|
||||||
|
@ -24,7 +24,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
common_init();
|
common_init();
|
||||||
|
|
||||||
if (params.model_draft.empty()) {
|
if (params.speculative.model.empty()) {
|
||||||
LOG_ERR("%s: --model-draft is required\n", __func__);
|
LOG_ERR("%s: --model-draft is required\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
@ -46,13 +46,13 @@ int main(int argc, char ** argv) {
|
|||||||
ctx_tgt = llama_init_tgt.context;
|
ctx_tgt = llama_init_tgt.context;
|
||||||
|
|
||||||
// load the draft model
|
// load the draft model
|
||||||
params.model = params.model_draft;
|
params.model = params.speculative.model;
|
||||||
params.n_gpu_layers = params.n_gpu_layers_draft;
|
params.n_gpu_layers = params.speculative.n_gpu_layers;
|
||||||
if (params.draft_cpuparams.n_threads > 0) {
|
if (params.speculative.cpuparams.n_threads > 0) {
|
||||||
params.cpuparams.n_threads = params.draft_cpuparams.n_threads;
|
params.cpuparams.n_threads = params.speculative.cpuparams.n_threads;
|
||||||
}
|
}
|
||||||
|
|
||||||
params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads;
|
params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
|
||||||
common_init_result llama_init_dft = common_init_from_params(params);
|
common_init_result llama_init_dft = common_init_from_params(params);
|
||||||
|
|
||||||
model_dft = llama_init_dft.model;
|
model_dft = llama_init_dft.model;
|
||||||
@ -66,11 +66,9 @@ int main(int argc, char ** argv) {
|
|||||||
std::vector<llama_token> inp;
|
std::vector<llama_token> inp;
|
||||||
inp = common_tokenize(ctx_tgt, params.prompt, true, true);
|
inp = common_tokenize(ctx_tgt, params.prompt, true, true);
|
||||||
|
|
||||||
const int max_context_size = llama_n_ctx(ctx_tgt);
|
if ((int) inp.size() > llama_n_ctx(ctx_tgt)) {
|
||||||
const int max_tokens_list_size = max_context_size - 4;
|
LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), llama_n_ctx(ctx_tgt));
|
||||||
|
|
||||||
if ((int) inp.size() > max_tokens_list_size) {
|
|
||||||
LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -81,7 +79,10 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// how many tokens to draft each time
|
// how many tokens to draft each time
|
||||||
int n_draft = params.n_draft;
|
int n_draft = params.speculative.n_max;
|
||||||
|
int n_draft_min = params.speculative.n_min;
|
||||||
|
|
||||||
|
float p_min = params.speculative.p_min;
|
||||||
|
|
||||||
int n_predict = 0;
|
int n_predict = 0;
|
||||||
int n_drafted = 0;
|
int n_drafted = 0;
|
||||||
@ -97,7 +98,7 @@ int main(int argc, char ** argv) {
|
|||||||
const auto t_enc_start = ggml_time_us();
|
const auto t_enc_start = ggml_time_us();
|
||||||
|
|
||||||
// target model sampling context
|
// target model sampling context
|
||||||
struct common_sampler * smpl = common_sampler_init(model_tgt, params.sparams);
|
struct common_sampler * smpl = common_sampler_init(model_tgt, params.sampling);
|
||||||
|
|
||||||
// eval the prompt
|
// eval the prompt
|
||||||
llama_decode(ctx_tgt, llama_batch_get_one(inp.data(), inp.size() - 1));
|
llama_decode(ctx_tgt, llama_batch_get_one(inp.data(), inp.size() - 1));
|
||||||
@ -112,9 +113,9 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
// init the speculator
|
// init the speculator
|
||||||
struct common_speculative_params params_spec;
|
struct common_speculative_params params_spec;
|
||||||
params_spec.n_draft = n_draft;
|
params_spec.n_draft = n_draft;
|
||||||
params_spec.n_reuse = 256;
|
params_spec.n_reuse = 256;
|
||||||
params_spec.p_min = 0.9f;
|
params_spec.p_min = p_min;
|
||||||
|
|
||||||
struct common_speculative * spec = common_speculative_init(ctx_dft);
|
struct common_speculative * spec = common_speculative_init(ctx_dft);
|
||||||
|
|
||||||
@ -143,7 +144,7 @@ int main(int argc, char ** argv) {
|
|||||||
// evaluate the target model on [id_last, draft0, draft1, ..., draftN-1]
|
// evaluate the target model on [id_last, draft0, draft1, ..., draftN-1]
|
||||||
{
|
{
|
||||||
// do not waste time on small drafts
|
// do not waste time on small drafts
|
||||||
if (draft.size() < params.n_draft_min) {
|
if (draft.size() < n_draft_min) {
|
||||||
draft.clear();
|
draft.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -33,7 +33,7 @@ int main(int argc, char ** argv) {
|
|||||||
common_params params;
|
common_params params;
|
||||||
|
|
||||||
// needed to get candidate probs even for temp <= 0.0
|
// needed to get candidate probs even for temp <= 0.0
|
||||||
params.sparams.n_probs = 128;
|
params.sampling.n_probs = 128;
|
||||||
|
|
||||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
|
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
|
||||||
return 1;
|
return 1;
|
||||||
@ -46,7 +46,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
common_init();
|
common_init();
|
||||||
|
|
||||||
if (params.model_draft.empty()) {
|
if (params.speculative.model.empty()) {
|
||||||
LOG_ERR("%s: --model-draft is required\n", __func__);
|
LOG_ERR("%s: --model-draft is required\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
@ -55,9 +55,9 @@ int main(int argc, char ** argv) {
|
|||||||
const int n_seq_dft = params.n_parallel;
|
const int n_seq_dft = params.n_parallel;
|
||||||
|
|
||||||
// probability threshold for splitting a draft branch (only for n_seq_dft > 1)
|
// probability threshold for splitting a draft branch (only for n_seq_dft > 1)
|
||||||
const float p_split = params.p_split;
|
const float p_draft_split = params.speculative.p_split;
|
||||||
|
|
||||||
std::default_random_engine rng(params.sparams.seed == LLAMA_DEFAULT_SEED ? std::random_device()() : params.sparams.seed);
|
std::default_random_engine rng(params.sampling.seed == LLAMA_DEFAULT_SEED ? std::random_device()() : params.sampling.seed);
|
||||||
std::uniform_real_distribution<> u_dist;
|
std::uniform_real_distribution<> u_dist;
|
||||||
|
|
||||||
// init llama.cpp
|
// init llama.cpp
|
||||||
@ -76,13 +76,13 @@ int main(int argc, char ** argv) {
|
|||||||
ctx_tgt = llama_init_tgt.context;
|
ctx_tgt = llama_init_tgt.context;
|
||||||
|
|
||||||
// load the draft model
|
// load the draft model
|
||||||
params.model = params.model_draft;
|
params.model = params.speculative.model;
|
||||||
params.n_gpu_layers = params.n_gpu_layers_draft;
|
params.n_gpu_layers = params.speculative.n_gpu_layers;
|
||||||
if (params.draft_cpuparams.n_threads > 0) {
|
if (params.speculative.cpuparams.n_threads > 0) {
|
||||||
params.cpuparams.n_threads = params.draft_cpuparams.n_threads;
|
params.cpuparams.n_threads = params.speculative.cpuparams.n_threads;
|
||||||
}
|
}
|
||||||
|
|
||||||
params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads;
|
params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
|
||||||
common_init_result llama_init_dft = common_init_from_params(params);
|
common_init_result llama_init_dft = common_init_from_params(params);
|
||||||
model_dft = llama_init_dft.model;
|
model_dft = llama_init_dft.model;
|
||||||
ctx_dft = llama_init_dft.context;
|
ctx_dft = llama_init_dft.context;
|
||||||
@ -170,7 +170,7 @@ int main(int argc, char ** argv) {
|
|||||||
//GGML_ASSERT(n_vocab == llama_n_vocab(model_dft));
|
//GGML_ASSERT(n_vocab == llama_n_vocab(model_dft));
|
||||||
|
|
||||||
// how many tokens to draft each time
|
// how many tokens to draft each time
|
||||||
int n_draft = params.n_draft;
|
int n_draft = params.speculative.n_max;
|
||||||
|
|
||||||
int n_predict = 0;
|
int n_predict = 0;
|
||||||
int n_drafted = 0;
|
int n_drafted = 0;
|
||||||
@ -183,14 +183,14 @@ int main(int argc, char ** argv) {
|
|||||||
bool has_eos = false;
|
bool has_eos = false;
|
||||||
|
|
||||||
// target model sampling context (reuse the llama_context's sampling instance)
|
// target model sampling context (reuse the llama_context's sampling instance)
|
||||||
struct common_sampler * smpl = common_sampler_init(model_tgt, params.sparams);
|
struct common_sampler * smpl = common_sampler_init(model_tgt, params.sampling);
|
||||||
|
|
||||||
// draft sequence data
|
// draft sequence data
|
||||||
std::vector<seq_draft> drafts(n_seq_dft);
|
std::vector<seq_draft> drafts(n_seq_dft);
|
||||||
|
|
||||||
for (int s = 0; s < n_seq_dft; ++s) {
|
for (int s = 0; s < n_seq_dft; ++s) {
|
||||||
// allocate llama_sampler for each draft sequence
|
// allocate llama_sampler for each draft sequence
|
||||||
drafts[s].smpl = common_sampler_init(model_dft, params.sparams);
|
drafts[s].smpl = common_sampler_init(model_dft, params.sampling);
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_batch batch_dft = llama_batch_init(llama_n_batch(ctx_dft), 0, 1);
|
llama_batch batch_dft = llama_batch_init(llama_n_batch(ctx_dft), 0, 1);
|
||||||
@ -230,7 +230,7 @@ int main(int argc, char ** argv) {
|
|||||||
// for stochastic sampling, attempt to match the token with the drafted tokens
|
// for stochastic sampling, attempt to match the token with the drafted tokens
|
||||||
{
|
{
|
||||||
bool accept = false;
|
bool accept = false;
|
||||||
if (params.sparams.temp > 0) {
|
if (params.sampling.temp > 0) {
|
||||||
// stochastic verification
|
// stochastic verification
|
||||||
common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true);
|
common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true);
|
||||||
|
|
||||||
@ -494,7 +494,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
// attempt to split the branch if the probability is high enough
|
// attempt to split the branch if the probability is high enough
|
||||||
for (int f = 1; f < 8; ++f) {
|
for (int f = 1; f < 8; ++f) {
|
||||||
if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_split) {
|
if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_draft_split) {
|
||||||
LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur);
|
LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur);
|
||||||
|
|
||||||
llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1);
|
llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1);
|
||||||
|
@ -96,7 +96,7 @@ int main(void) {
|
|||||||
// --draft cannot be used outside llama-speculative
|
// --draft cannot be used outside llama-speculative
|
||||||
argv = {"binary_name", "--draft", "123"};
|
argv = {"binary_name", "--draft", "123"};
|
||||||
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE));
|
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE));
|
||||||
assert(params.n_draft == 123);
|
assert(params.speculative.n_max == 123);
|
||||||
|
|
||||||
// skip this part on windows, because setenv is not supported
|
// skip this part on windows, because setenv is not supported
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
|
Loading…
Reference in New Issue
Block a user