Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2024-12-26 03:14:35 +00:00)
common : add missing env var for speculative (#10801)
commit 9fdb124304, parent 5555c0c1f6
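
This change registers an environment variable for each remaining speculative-decoding argument via set_env(), as the diff below shows. For illustration, here is a minimal standalone sketch of the fallback behavior such a binding provides (the helper name, the default value, and the command-line-over-environment precedence are assumptions for the sketch, not the actual llama.cpp parser):

// Hypothetical sketch: how an option bound with set_env("LLAMA_ARG_DRAFT_MAX")
// might be resolved. Not the real llama.cpp implementation.
#include <cstdlib>
#include <string>

static int resolve_draft_max(bool cli_given, int cli_value) {
    if (cli_given) {
        return cli_value;                                   // explicit flag wins
    }
    if (const char * env = std::getenv("LLAMA_ARG_DRAFT_MAX")) {
        return std::stoi(env);                              // fall back to the env var
    }
    return 16;                                              // assumed default
}

With such a binding, exporting LLAMA_ARG_DRAFT_MAX in the environment has the same effect as passing the corresponding flag on the command line.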
@@ -2083,35 +2083,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.speculative.n_max = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
     add_opt(common_arg(
         {"--draft-min", "--draft-n-min"}, "N",
         string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
         [](common_params & params, int value) {
             params.speculative.n_min = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
     add_opt(common_arg(
         {"--draft-p-split"}, "P",
         string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
         [](common_params & params, const std::string & value) {
             params.speculative.p_split = std::stof(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT"));
     add_opt(common_arg(
         {"--draft-p-min"}, "P",
         string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min),
         [](common_params & params, const std::string & value) {
             params.speculative.p_min = std::stof(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
     add_opt(common_arg(
         {"-cd", "--ctx-size-draft"}, "N",
         string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
         [](common_params & params, int value) {
             params.speculative.n_ctx = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
     add_opt(common_arg(
         {"-devd", "--device-draft"}, "<dev1,dev2,..>",
         "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
@@ -2131,14 +2131,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
     add_opt(common_arg(
         {"-md", "--model-draft"}, "FNAME",
         "draft model for speculative decoding (default: unused)",
         [](common_params & params, const std::string & value) {
             params.speculative.model = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
 
     return ctx_arg;
 }
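
With these additions, every speculative-decoding option above can be configured from the environment as well as from the command line; for example, setting LLAMA_ARG_DRAFT_MIN=4 is equivalent to passing --draft-min 4. This fills in the env-var bindings that the other arguments already carried, which is what the commit title's "missing env var" refers to.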