mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-29 04:44:34 +00:00
speculative : add --draft-min CLI arg
Some checks are pending
flake8 Lint / Lint (push) Waiting to run
Some checks are pending
flake8 Lint / Lint (push) Waiting to run
This commit is contained in:
parent
0d4d0c1559
commit
f27ddc57d7
@ -609,7 +609,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.n_draft = value;
|
params.n_draft = value;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"--draft-min"}, "N",
|
||||||
|
string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.n_draft_min),
|
||||||
|
[](common_params & params, int value) {
|
||||||
|
params.n_draft_min = value;
|
||||||
|
}
|
||||||
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-ps", "--p-split"}, "N",
|
{"-ps", "--p-split"}, "N",
|
||||||
string_format("speculative decoding split probability (default: %.1f)", (double)params.p_split),
|
string_format("speculative decoding split probability (default: %.1f)", (double)params.p_split),
|
||||||
@ -1454,7 +1461,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||||||
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-sm", "--split-mode"}, "{none,layer,row}",
|
{"-sm", "--split-mode"}, "{none,layer,row}",
|
||||||
"how to split the model across multiple GPUs, one of:\n"
|
"how to split the model across multiple GPUs, one of:\n"
|
||||||
@ -1599,7 +1606,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.model_draft = value;
|
params.model_draft = value;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-mu", "--model-url"}, "MODEL_URL",
|
{"-mu", "--model-url"}, "MODEL_URL",
|
||||||
"model download url (default: unused)",
|
"model download url (default: unused)",
|
||||||
|
@ -162,6 +162,7 @@ struct common_params {
|
|||||||
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
|
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
|
||||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||||
int32_t n_draft = 5; // number of tokens to draft during speculative decoding
|
int32_t n_draft = 5; // number of tokens to draft during speculative decoding
|
||||||
|
int32_t n_draft_min = 0; // minimum number of draft tokens to use for speculative decoding
|
||||||
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
|
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
|
||||||
int32_t n_parallel = 1; // number of parallel sequences to decode
|
int32_t n_parallel = 1; // number of parallel sequences to decode
|
||||||
int32_t n_sequences = 1; // number of sequences to decode
|
int32_t n_sequences = 1; // number of sequences to decode
|
||||||
|
@ -13,9 +13,6 @@
|
|||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
common_params params;
|
common_params params;
|
||||||
|
|
||||||
// minimum size of the draft to use
|
|
||||||
const int n_min = 5;
|
|
||||||
|
|
||||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
|
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
@ -142,7 +139,7 @@ int main(int argc, char ** argv) {
|
|||||||
// evaluate the target model on [id_last, draft0, draft1, ..., draftN-1]
|
// evaluate the target model on [id_last, draft0, draft1, ..., draftN-1]
|
||||||
{
|
{
|
||||||
// do not waste time on small drafts
|
// do not waste time on small drafts
|
||||||
if (draft.size() < n_min) {
|
if (draft.size() < params.n_draft_min) {
|
||||||
draft.clear();
|
draft.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user