From 921772104ba2219bfdc2b2980d05ebc0aa0c92a4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 5 Sep 2023 08:46:17 +0300 Subject: [PATCH] speculative : add grammar support (#2991) * speculative : add grammar support * grammars : add json_arr.gbnf * grammar : add comments to new grammar file * grammar : remove one nested level * common : warm-up with 2 tokens - seems to work better * speculative : print draft token pieces * speculative : reuse grammar parser + better logs and comments * speculative : avoid grammar_mem * make : fix speculative build --- Makefile | 2 +- common/common.cpp | 2 +- examples/speculative/speculative.cpp | 80 ++++++++++++++++++++++++---- grammars/json_arr.gbnf | 34 ++++++++++++ llama.cpp | 19 +++++++ llama.h | 2 + 6 files changed, 126 insertions(+), 13 deletions(-) create mode 100644 grammars/json_arr.gbnf diff --git a/Makefile b/Makefile index 847aa3a85..139fa02a8 100644 --- a/Makefile +++ b/Makefile @@ -495,7 +495,7 @@ baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o common.o $(OBJS) beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o common.o $(OBJS) +speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) ifdef LLAMA_METAL diff --git a/common/common.cpp b/common/common.cpp index 74e1b6fd2..d4f9dbf55 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -772,7 +772,7 @@ std::tuple llama_init_from_gpt_par { LOG("warming up the model with an empty run\n"); - const std::vector tmp = { llama_token_bos(lctx), }; + const std::vector tmp = { llama_token_bos(lctx), llama_token_eos(lctx), }; llama_eval(lctx, tmp.data(), tmp.size(), 0, params.n_threads); llama_reset_timings(lctx); } diff --git a/examples/speculative/speculative.cpp 
b/examples/speculative/speculative.cpp index f0400c13f..c6211ac79 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -6,6 +6,7 @@ #include "common.h" #include "llama.h" +#include "grammar-parser.h" #include #include @@ -109,16 +110,35 @@ int main(int argc, char ** argv) { // used to determine end of generation bool has_eos = false; + // grammar stuff + struct llama_grammar * grammar_dft = NULL; + struct llama_grammar * grammar_tgt = NULL; + + grammar_parser::parse_state parsed_grammar; + + // if requested - load the grammar, error checking is omitted for brevity + if (!params.grammar.empty()) { + parsed_grammar = grammar_parser::parse(params.grammar.c_str()); + // will be empty (default) if there are parse errors + if (parsed_grammar.rules.empty()) { + return 1; + } + + std::vector grammar_rules(parsed_grammar.c_rules()); + grammar_tgt = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); + } + const auto t_dec_start = ggml_time_us(); while (true) { LOG("drafted: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_dft, drafted)); - // sample from the drafted tokens if any int i_dft = 0; while (true) { - const llama_token id = llama_sample_token(ctx_tgt, NULL, NULL, params, last_tokens, candidates, i_dft); + // sample from the target model + const llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft); + // remember which tokens were sampled - used for repetition penalties during sampling last_tokens.erase(last_tokens.begin()); last_tokens.push_back(id); @@ -134,8 +154,9 @@ int main(int argc, char ** argv) { ++n_predict; + // check if the draft matches the target if (i_dft < (int) drafted.size() && id == drafted[i_dft]) { - LOG("drafted token %d accepted\n", id); + LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str()); ++n_accept; ++n_past_tgt; ++n_past_dft; @@ -145,6 +166,14 @@ 
int main(int argc, char ** argv) { } // the drafted token was rejected or we are out of drafted tokens + + if (i_dft < (int) drafted.size()) { + LOG("the %dth drafted token (%d, '%s') does not match the sampled target token (%d, '%s') - rejected\n", + i_dft, drafted[i_dft], llama_token_to_piece(ctx_dft, drafted[i_dft]).c_str(), id, token_str.c_str()); + } else { + LOG("out of drafted tokens\n"); + } + llama_eval(ctx_dft, &id, 1, n_past_dft, params.n_threads); ++n_past_dft; @@ -158,7 +187,16 @@ int main(int argc, char ** argv) { break; } - // sample n_draft tokens from the draft model picking the best token + if (grammar_tgt) { + if (grammar_dft) { + llama_grammar_free(grammar_dft); + } + grammar_dft = llama_grammar_copy(grammar_tgt); + + LOG("copied target grammar to draft grammar\n"); + } + + // sample n_draft tokens from the draft model using greedy decoding int n_past_cur = n_past_dft; for (int i = 0; i < n_draft; ++i) { float * logits = llama_get_logits(ctx_dft); @@ -170,25 +208,40 @@ int main(int argc, char ** argv) { llama_token_data_array cur_p = { candidates.data(), candidates.size(), false }; + if (grammar_dft != NULL) { + llama_sample_grammar(ctx_dft, &cur_p, grammar_dft); + } + // computes softmax and sorts the candidates llama_sample_softmax(ctx_dft, &cur_p); for (int i = 0; i < 3; ++i) { - LOG(" - draft candidate %d: %d (%.3f)\n", i, cur_p.data[i].id, cur_p.data[i].p); + LOG(" - draft candidate %3d: %6d (%8.3f) '%s'\n", i, cur_p.data[i].id, cur_p.data[i].p, llama_token_to_piece(ctx_dft, cur_p.data[i].id).c_str()); } - // too low probability, stop drafting + // TODO: better logic? 
if (cur_p.data[0].p < 2*cur_p.data[1].p) { + LOG("stopping drafting, probability too low: %.3f < 2*%.3f\n", cur_p.data[0].p, cur_p.data[1].p); break; } - drafted.push_back(cur_p.data[0].id); + // drafted token + const llama_token id = cur_p.data[0].id; + + drafted.push_back(id); ++n_drafted; - if (i < n_draft - 1) { - // evaluate the drafted token on the draft model - llama_eval(ctx_dft, &drafted.back(), 1, n_past_cur, params.n_threads); - ++n_past_cur; + // no need to evaluate the last drafted token, since we won't use the result + if (i == n_draft - 1) { + break; + } + + // evaluate the drafted token on the draft model + llama_eval(ctx_dft, &drafted.back(), 1, n_past_cur, params.n_threads); + ++n_past_cur; + + if (grammar_dft != NULL) { + llama_grammar_accept_token(ctx_dft, grammar_dft, id); } } @@ -196,6 +249,7 @@ int main(int argc, char ** argv) { llama_eval(ctx_tgt, drafted.data(), drafted.size(), n_past_tgt, params.n_threads); ++n_past_tgt; + // the first token is always proposed by the target model before the speculation loop drafted.erase(drafted.begin()); } @@ -226,6 +280,10 @@ int main(int argc, char ** argv) { llama_free(ctx_dft); llama_free_model(model_dft); + if (grammar_dft != NULL) { + llama_grammar_free(grammar_dft); + llama_grammar_free(grammar_tgt); + } llama_backend_free(); fprintf(stderr, "\n\n"); diff --git a/grammars/json_arr.gbnf b/grammars/json_arr.gbnf new file mode 100644 index 000000000..ef53e77a0 --- /dev/null +++ b/grammars/json_arr.gbnf @@ -0,0 +1,34 @@ +# This is the same as json.gbnf but we restrict whitespaces at the end of the root array +# Useful for generating JSON arrays + +root ::= arr +value ::= object | array | string | number | ("true" | "false" | "null") ws + +arr ::= + "[\n" ws ( + value + (",\n" ws value)* + )? "]" + +object ::= + "{" ws ( + string ":" ws value + ("," ws string ":" ws value)* + )? "}" ws + +array ::= + "[" ws ( + value + ("," ws value)* + )? 
"]" ws + +string ::= + "\"" ( + [^"\\] | + "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes + )* "\"" ws + +number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws + +# Optional space: by convention, applied in this grammar after literal chars when allowed +ws ::= ([ \t\n] ws)? diff --git a/llama.cpp b/llama.cpp index b9485df0c..edf3b4eaf 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3850,6 +3850,25 @@ void llama_grammar_free(struct llama_grammar * grammar) { delete grammar; } +struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) { + llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 }; + + // redirect elements in stacks to point to new rules + for (size_t is = 0; is < result->stacks.size(); is++) { + for (size_t ie = 0; ie < result->stacks[is].size(); ie++) { + for (size_t ir0 = 0; ir0 < grammar->rules.size(); ir0++) { + for (size_t ir1 = 0; ir1 < grammar->rules[ir0].size(); ir1++) { + if (grammar->stacks[is][ie] == &grammar->rules[ir0][ir1]) { + result->stacks[is][ie] = &result->rules[ir0][ir1]; + } + } + } + } + } + + return result; +} + // // sampling // diff --git a/llama.h b/llama.h index 422f28527..5b95aaa87 100644 --- a/llama.h +++ b/llama.h @@ -410,6 +410,8 @@ extern "C" { LLAMA_API void llama_grammar_free(struct llama_grammar * grammar); + LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar); + // // Sampling functions //