From afee3cfc1f4c1db7862bd0c80d02fbe8e9ca9422 Mon Sep 17 00:00:00 2001
From: Hashem Alsaket
Date: Sat, 24 Jun 2023 18:50:08 -0500
Subject: [PATCH] draft for #1776 making bos and eos available for user input
 instead of hard coded

---
 examples/common.cpp                          |  8 ++--
 examples/common.h                            |  5 +-
 examples/embedding/embedding.cpp             |  4 +-
 examples/main/main.cpp                       | 30 ++++++------
 examples/perplexity/perplexity.cpp           |  6 +--
 examples/server/server.cpp                   |  8 ++--
 examples/simple/simple.cpp                   |  6 +--
 .../train-text-from-scratch.cpp              | 15 +++---
 llama.cpp                                    | 48 +++++++++++--------
 llama.h                                      | 13 +++--
 10 files changed, 80 insertions(+), 63 deletions(-)

diff --git a/examples/common.cpp b/examples/common.cpp
index 6ac484555..cb8026cee 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -356,7 +356,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         } else if (arg == "--perplexity") {
             params.perplexity = true;
         } else if (arg == "--ignore-eos") {
-            params.logit_bias[llama_token_eos()] = -INFINITY;
+            params.logit_bias[params.eos_token] = -INFINITY;
         } else if (arg == "--no-penalize-nl") {
             params.penalize_nl = false;
         } else if (arg == "-l" || arg == "--logit-bias") {
@@ -526,10 +526,10 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
 }

 // TODO: not great allocating this every time
-std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
+std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos, bool add_eos) {
     // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
-    std::vector<llama_token> res(text.size() + (int) add_bos);
-    const int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
+    std::vector<llama_token> res(text.size() + (int) add_bos + (int) add_eos);
+    const int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos, add_eos);
     assert(n >= 0);
     res.resize(n);

diff --git a/examples/common.h b/examples/common.h
index 713320179..3e32a6a52 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -78,6 +78,9 @@ struct gpt_params {
     bool mem_test          = false; // compute maximum memory usage
     bool export_cgraph     = false; // export the computation graph
     bool verbose_prompt    = false; // print prompt tokens before generation
+
+    int bos_token = 1; // beginning of sentence token
+    int eos_token = 2; // end of sentence token
 };

 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
@@ -90,7 +93,7 @@ std::string gpt_random_prompt(std::mt19937 & rng);
 // Vocab utils
 //

-std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
+std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos, bool add_eos);

 //
 // Model utils
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 369eac1d1..9dec9d83d 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -60,7 +60,7 @@ int main(int argc, char ** argv) {
     params.prompt.insert(0, 1, ' ');

     // tokenize the prompt
-    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);

     if (params.verbose_prompt) {
         fprintf(stderr, "\n");
@@ -74,7 +74,7 @@ int main(int argc, char ** argv) {

     if (params.embedding){
         if (embd_inp.size() > 0) {
-            if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past, params.n_threads)) {
+            if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past, params.n_threads, params.bos_token, params.eos_token)) {
fprintf(stderr, "%s : failed to eval\n", __func__); return 1; } diff --git a/examples/main/main.cpp b/examples/main/main.cpp index c1e6bf126..9fba9a9bb 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -129,13 +129,13 @@ int main(int argc, char ** argv) { // uncomment the "used_mem" line in llama.cpp to see the results if (params.mem_test) { { - const std::vector tmp(params.n_batch, llama_token_bos()); - llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads); + const std::vector tmp(params.n_batch, params.bos_token); + llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads, params.bos_token, params.eos_token); } { const std::vector tmp = { 0, }; - llama_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads); + llama_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads, params.bos_token, params.eos_token); } llama_print_timings(ctx); @@ -147,7 +147,7 @@ int main(int argc, char ** argv) { // export the cgraph and exit if (params.export_cgraph) { - llama_eval_export(ctx, "llama.ggml"); + llama_eval_export(ctx, "llama.ggml", params.bos_token, params.eos_token); llama_free(ctx); llama_free_model(model); @@ -187,7 +187,7 @@ int main(int argc, char ** argv) { // Add a space in front of the first character to match OG llama tokenizer behavior params.prompt.insert(0, 1, ' '); - embd_inp = ::llama_tokenize(ctx, params.prompt, true); + embd_inp = ::llama_tokenize(ctx, params.prompt, true, true); } else { embd_inp = session_tokens; } @@ -234,8 +234,8 @@ int main(int argc, char ** argv) { } // prefix & suffix for instruct mode - const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true); - const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false); + const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true, true); + const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, false); // in instruct mode, we inject a prefix and a suffix to each input by the user if (params.instruct) { @@ -249,7 +249,7 @@ int main(int argc, char ** argv) { } // determine newline token - auto llama_token_newline = ::llama_tokenize(ctx, "\n", false); + auto llama_token_newline = ::llama_tokenize(ctx, "\n", false, false); if (params.verbose_prompt) { fprintf(stderr, "\n"); @@ -342,8 +342,8 @@ int main(int argc, char ** argv) { // do one empty run to warm up the model { - const std::vector tmp = { llama_token_bos(), }; - llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads); + const std::vector tmp = { params.bos_token, }; + llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads, params.bos_token, params.eos_token); llama_reset_timings(ctx); } @@ -417,7 +417,7 @@ int main(int argc, char ** argv) { if (n_eval > params.n_batch) { n_eval = params.n_batch; } - if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) { + if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads, params.bos_token, params.eos_token)) { fprintf(stderr, "%s : failed to eval\n", __func__); return 1; } @@ -516,11 +516,11 @@ int main(int argc, char ** argv) { } // replace end of text token with newline token when in interactive mode - if (id == llama_token_eos() && params.interactive && !params.instruct) { + if (id == params.eos_token && params.interactive && !params.instruct) { id = llama_token_newline.front(); if (params.antiprompt.size() != 0) { // tokenize and inject first reverse prompt - const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false); + const auto 
                    embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
                }
            }
@@ -626,7 +626,7 @@ int main(int argc, char ** argv) {
                 embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
             }

-            auto line_inp = ::llama_tokenize(ctx, buffer, false);
+            auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
             embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());

             // instruct mode: insert response suffix
@@ -646,7 +646,7 @@ int main(int argc, char ** argv) {
         }

         // end of text token
-        if (!embd.empty() && embd.back() == llama_token_eos()) {
+        if (!embd.empty() && embd.back() == params.eos_token) {
            if (params.instruct) {
                is_interacting = true;
            } else {
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index b59f5971e..5e96b40d2 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -30,7 +30,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
     // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
     // Output: `perplexity: 13.5106 [114/114]`
     // BOS tokens will be added for each chunk before eval
-    auto tokens = ::llama_tokenize(ctx, params.prompt, true);
+    auto tokens = ::llama_tokenize(ctx, params.prompt, true, true);

     int count = 0;

@@ -60,10 +60,10 @@ void perplexity(llama_context * ctx, const gpt_params & params) {

             // add BOS token for the first batch of each chunk
             if (j == 0) {
-                tokens[batch_start] = llama_token_bos();
+                tokens[batch_start] = params.bos_token;
             }

-            if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) {
+            if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads, params.bos_token, params.eos_token)) {
                 fprintf(stderr, "%s : failed to eval\n", __func__);
                 return;
             }
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index de22d3013..fe32030cd 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -261,7 +261,7 @@ struct llama_server_context {

        if (params.n_predict == 0) {
            has_next_token = false;
-           return llama_token_eos();
+           return params.eos_token;
        }

        // out of user input, sample next token
@@ -344,7 +344,7 @@ struct llama_server_context {
        // decrement remaining sampling budget
        --n_remain;

-       if (!embd.empty() && embd.back() == llama_token_eos()) {
+       if (!embd.empty() && embd.back() == params.eos_token) {
            //stopping_word = llama_token_to_str(ctx, embd.back());
            has_next_token = false;
            stopped_eos = true;
@@ -644,7 +644,7 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
 }

 static json format_generation_settings(llama_server_context & llama) {
-    const auto eos_bias = llama.params.logit_bias.find(llama_token_eos());
+    const auto eos_bias = llama.params.logit_bias.find(llama.params.eos_token);
     const bool ignore_eos = eos_bias != llama.params.logit_bias.end() &&
         eos_bias->second < 0.0f && std::isinf(eos_bias->second);

@@ -731,7 +731,7 @@ static void parse_options_completion(const json & body, llama_server_context & l
     llama.params.logit_bias.clear();

     if (body.value("ignore_eos", false)) {
-        llama.params.logit_bias[llama_token_eos()] = -INFINITY;
+        llama.params.logit_bias[default_params.eos_token] = -INFINITY;
     }

     const auto & logit_bias = body.find("logit_bias");
diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
index fc45c9340..f16f6ddb1 100644
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -84,7 +84,7 @@ int main(int argc, char ** argv)
     //---------------------------------

     std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize( ctx , params.prompt , true );
+    tokens_list = ::llama_tokenize( ctx , params.prompt , true, true );

     const int max_context_size     = llama_n_ctx( ctx );
     const int max_tokens_list_size = max_context_size - 4 ;
@@ -123,7 +123,7 @@ int main(int argc, char ** argv)
         // Evaluate the tokens :
         //---------------------------------

-        if ( llama_eval( ctx , tokens_list.data() , tokens_list.size() , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
+        if ( llama_eval( ctx , tokens_list.data() , tokens_list.size() , llama_get_kv_cache_token_count( ctx ) , params.n_threads, params.bos_token, params.eos_token ) )
         {
             fprintf( stderr, "%s : failed to eval\n" , __func__ );
             return 1;
@@ -155,7 +155,7 @@ int main(int argc, char ** argv)

         // is it an end of stream ?
-        if ( new_token_id == llama_token_eos() )
+        if ( new_token_id == params.eos_token )
         {
             fprintf(stderr, " [end of text]\n");
             break;
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index 61c829e5c..fac811079 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -2003,7 +2003,7 @@ void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * tokens)
     }
 }

-void get_example_targets(const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
+void get_example_targets(const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs, llama_token bos_token = 1) {
     int n_tokens = tokens_input->ne[0];
     int n_vocab  = target_logits->ne[0];

@@ -2012,7 +2012,7 @@ void get_example_targets(const int * train_samples, size_t n_train_samples, cons
     ggml_set_f32(target_logits, -1.0f/n_vocab);
     ggml_set_f32(target_probs,   0.0f);
-    ggml_set_i32_1d(tokens_input, 0, llama_token_bos());
+    ggml_set_i32_1d(tokens_input, 0, bos_token);
     for (int i=1; i<n_tokens+1; ++i) {
@@ -2029,7 +2029,7 @@ void get_example_targets(const int * train_samples, size_t n_train_samples, cons
-void get_example_targets_batch(struct llama_context * /*lctx*/, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
+void get_example_targets_batch(struct llama_context * /*lctx*/, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs, llama_token bos_token = 1, llama_token eos_token = 2) {
     GGML_ASSERT(tokens_input->n_dims  == 2);
     GGML_ASSERT(target_logits->n_dims == 3);
     GGML_ASSERT(target_probs->n_dims  == 3);
@@ -2043,7 +2043,7 @@ void get_example_targets_batch(struct llama_context * /*lctx*/, const int * trai
         size_t sample = train_samples[(example_id*n_batch + k) % n_train_samples];
         GGML_ASSERT(sample+n_tokens-1 < n_train_data);

-        set_i32_2d(tokens_input, 0, k, llama_token_bos());
+        set_i32_2d(tokens_input, 0, k, bos_token);
         for (int i=1; i<n_tokens+1; ++i) {
@@ -2188,7 +2188,7 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vector<llama_token> & out) {
-    int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), buf.size(), false);
+    int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), buf.size(), false, false);
     if (n_tokens >= 0) {
         out.resize(n_tokens);
     }
@@ -2698,6 +2698,9 @@ struct train_params {
     int print_info_interval;
     int print_details_interval;

+    int bos_token;
+    int eos_token;
+
     bool samples_start_after_nl;
     bool use_adam;
     bool use_flash;
@@ -3231,7 +3234,7 @@ int main(int argc, char ** argv) {
         gf->n_threads = params.n_threads;
         gb->n_threads = params.n_threads;

-        get_example_targets_batch(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs);
+        get_example_targets_batch(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs, params.bos_token, params.eos_token);

         GGML_ASSERT(n_past == 0);
diff --git a/llama.cpp b/llama.cpp
index ac22a48f8..501e24c46 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1373,14 +1373,22 @@ static bool llama_eval_internal(
         const int   n_tokens,
         const int   n_past,
         const int   n_threads,
-        const char * cgraph_fname) {
+        const char * cgraph_fname,
+              int    bos_token,
+              int    eos_token) {

     // enforce that the first token is BOS
-    if (n_past == 0 && tokens[0] != llama_token_bos()) {
+    if (n_past == 0 && tokens[0] != bos_token) {
         fprintf(stderr, "%s: first token must be BOS\n", __func__);
         return false;
     }

+    // enforce that the last token is EOS
+    // if (n_past == 0 && tokens[-1] != eos_token) {
+    //     fprintf(stderr, "%s: last token must be EOS\n", __func__);
+    //     return false;
+    // }
+
     const int64_t t_start_us = ggml_time_us();

     const int N = n_tokens;
@@ -1925,7 +1933,7 @@ private:
     llama_sp_bigram::queue work_queue_;
 };

-static std::vector<llama_token> llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos) {
+static std::vector<llama_token> llama_tokenize(const llama_vocab & vocab, const std::string & text, int bos_token, int eos_token) {
     llama_tokenizer tokenizer(vocab);

     std::vector<llama_token> output;
@@ -1933,11 +1941,16 @@ static std::vector<llama_token> llama_tokenize(const llama_vocab & vocab, co
         return output;
     }

-    if (bos) {
-        output.push_back(llama_token_bos());
+    if (bos_token != 0) {
+        output.push_back(bos_token);
     }

     tokenizer.tokenize(text, output);
+
+    if (eos_token != 0) {
+        output.push_back(eos_token);
+    }
+
     return output;
 }

@@ -3407,8 +3420,10 @@ int llama_eval(
         const llama_token * tokens,
                       int   n_tokens,
                       int   n_past,
-                      int   n_threads) {
-    if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) {
+                      int   n_threads,
+                      int   bos_token,
+                      int   eos_token) {
+    if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr, bos_token, eos_token)) {
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;
     }
@@ -3423,13 +3438,13 @@ int llama_eval(
     return 0;
 }

-int llama_eval_export(struct llama_context * ctx, const char * fname) {
+int llama_eval_export(struct llama_context * ctx, const char * fname, int bos_token = 1, int eos_token = 2) {
     const int n_batch = 1;
     const int n_ctx   = 512 - n_batch;

-    const std::vector<llama_token> tmp(n_batch, llama_token_bos());
+    const std::vector<llama_token> tmp(n_batch, bos_token);

-    if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname)) {
+    if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname, bos_token, eos_token)) {
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;
     }
@@ -3442,8 +3457,9 @@ int llama_tokenize(
         const char * text,
         llama_token * tokens,
         int   n_max_tokens,
-        bool   add_bos) {
-    auto res = llama_tokenize(ctx->vocab, text, add_bos);
+        bool   add_bos,
+        bool   add_eos) {
+    auto res = llama_tokenize(ctx->vocab, text, add_bos, add_eos);

     if (n_max_tokens < (int) res.size()) {
         fprintf(stderr, "%s: too many tokens\n", __func__);
@@ -3498,14 +3514,6 @@ const char * llama_token_to_str(const struct llama_context * ctx, llama_token to
     return ctx->vocab.id_to_token[token].tok.c_str();
 }

-llama_token llama_token_bos() {
-    return 1;
-}
-
-llama_token llama_token_eos() {
-    return 2;
-}
-
 llama_token llama_token_nl() {
     return 13;
 }
diff --git a/llama.h b/llama.h
index a833a7f4d..4b6edd1d1 100644
--- a/llama.h
+++ b/llama.h
@@ -223,13 +223,15 @@ extern "C" {
            const llama_token * tokens,
                          int   n_tokens,
                          int   n_past,
-                         int   n_threads);
+                         int   n_threads,
+                         int   bos_token,
+                         int   eos_token);

     // Export a static computation graph for context of 511 and batch size of 1
     // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
     //       parameters here to keep things simple
     // IMPORTANT: do not use for anything else other than debugging and testing!
-    LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
+    LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname, int bos_token, int eos_token);

     // Convert the provided text into tokens.
     // The tokens pointer must be large enough to hold the resulting tokens.
@@ -241,7 +243,8 @@ extern "C" {
             const char * text,
            llama_token * tokens,
                    int   n_max_tokens,
-                  bool   add_bos);
+                  bool   add_bos,
+                  bool   add_eos);

     LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
     LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
@@ -270,8 +273,8 @@ extern "C" {
     LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);

     // Special tokens
-    LLAMA_API llama_token llama_token_bos();  // beginning-of-sentence
-    LLAMA_API llama_token llama_token_eos();  // end-of-sentence
+//    LLAMA_API llama_token llama_token_bos();  // beginning-of-sentence
+//    LLAMA_API llama_token llama_token_eos();  // end-of-sentence
     LLAMA_API llama_token llama_token_nl();   // next-line

     // Sampling functions
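
Usage sketch (not part of the patch): the snippet below is one way a caller could drive the modified API, assuming the draft signatures above and the default SentencePiece ids from examples/common.h in this patch (bos_token = 1, eos_token = 2). The helper name eval_prompt and its surrounding setup are illustrative only; model and context creation are omitted.

    // Illustrative only: exercises the llama_tokenize()/llama_eval() signatures
    // from this draft. `ctx` is assumed to come from llama_init_from_file()
    // (or an equivalent loader); error handling mirrors the examples above.
    #include "llama.h"

    #include <cstdio>
    #include <string>
    #include <vector>

    static bool eval_prompt(llama_context * ctx, const std::string & prompt, int n_threads,
                            llama_token bos_token = 1, llama_token eos_token = 2) {
        // One slot per prompt char plus room for BOS/EOS, as examples/common.cpp does.
        std::vector<llama_token> tokens(prompt.size() + 2);

        // add_bos = true so the BOS check in llama_eval_internal() passes with the
        // default ids; add_eos = false so generation can continue past the prompt.
        const int n = llama_tokenize(ctx, prompt.c_str(), tokens.data(), (int) tokens.size(),
                                     /*add_bos =*/ true, /*add_eos =*/ false);
        if (n < 0) {
            fprintf(stderr, "%s: tokenization failed\n", __func__);
            return false;
        }
        tokens.resize(n);

        // llama_eval() now carries the ids to treat as BOS/EOS instead of the
        // removed llama_token_bos()/llama_token_eos() helpers.
        if (llama_eval(ctx, tokens.data(), (int) tokens.size(), /*n_past =*/ 0, n_threads,
                       bos_token, eos_token)) {
            fprintf(stderr, "%s: llama_eval failed\n", __func__);
            return false;
        }
        return true;
    }

Since the patched tokenizer only appends a special token when the corresponding flag/id is non-zero, passing false (0) for add_bos or add_eos skips that token entirely.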