From afee3cfc1f4c1db7862bd0c80d02fbe8e9ca9422 Mon Sep 17 00:00:00 2001
From: Hashem Alsaket
Date: Sat, 24 Jun 2023 18:50:08 -0500
Subject: [PATCH] draft for #1776 making bos and eos available for user input
 instead of hard coded

---
 examples/common.cpp                          |  8 ++--
 examples/common.h                            |  5 +-
 examples/embedding/embedding.cpp             |  4 +-
 examples/main/main.cpp                       | 30 ++++++------
 examples/perplexity/perplexity.cpp           |  6 +--
 examples/server/server.cpp                   |  8 ++--
 examples/simple/simple.cpp                   |  6 +--
 .../train-text-from-scratch.cpp              | 15 +++---
 llama.cpp                                    | 48 +++++++++++--------
 llama.h                                      | 13 +++--
 10 files changed, 80 insertions(+), 63 deletions(-)

diff --git a/examples/common.cpp b/examples/common.cpp
index 6ac484555..cb8026cee 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -356,7 +356,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         } else if (arg == "--perplexity") {
             params.perplexity = true;
         } else if (arg == "--ignore-eos") {
-            params.logit_bias[llama_token_eos()] = -INFINITY;
+            params.logit_bias[params.eos_token] = -INFINITY;
         } else if (arg == "--no-penalize-nl") {
             params.penalize_nl = false;
         } else if (arg == "-l" || arg == "--logit-bias") {
@@ -526,10 +526,10 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
 }

 // TODO: not great allocating this every time
-std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
+std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos, bool add_eos) {
     // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
-    std::vector<llama_token> res(text.size() + (int) add_bos);
-    const int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
+    std::vector<llama_token> res(text.size() + (int) add_bos + (int) add_eos);
+    const int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos, add_eos);
     assert(n >= 0);
     res.resize(n);

diff --git a/examples/common.h b/examples/common.h
index 713320179..3e32a6a52 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -78,6 +78,9 @@ struct gpt_params {
     bool mem_test          = false; // compute maximum memory usage
     bool export_cgraph     = false; // export the computation graph
     bool verbose_prompt    = false; // print prompt tokens before generation
+
+    int bos_token = 1; // beginning of sentence token
+    int eos_token = 2; // end of sentence token
 };

 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
@@ -90,7 +93,7 @@ std::string gpt_random_prompt(std::mt19937 & rng);
 // Vocab utils
 //

-std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
+std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos, bool add_eos);

 //
 // Model utils
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 369eac1d1..9dec9d83d 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -60,7 +60,7 @@ int main(int argc, char ** argv) {
     params.prompt.insert(0, 1, ' ');

     // tokenize the prompt
-    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);

     if (params.verbose_prompt) {
         fprintf(stderr, "\n");
@@ -74,7 +74,7 @@ int main(int argc, char ** argv) {

     if (params.embedding){
         if (embd_inp.size() > 0) {
-            if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past, params.n_threads)) {
+            if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past, params.n_threads, params.bos_token, params.eos_token)) {
fprintf(stderr, "%s : failed to eval\n", __func__); return 1; } diff --git a/examples/main/main.cpp b/examples/main/main.cpp index c1e6bf126..9fba9a9bb 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -129,13 +129,13 @@ int main(int argc, char ** argv) { // uncomment the "used_mem" line in llama.cpp to see the results if (params.mem_test) { { - const std::vector tmp(params.n_batch, llama_token_bos()); - llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads); + const std::vector tmp(params.n_batch, params.bos_token); + llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads, params.bos_token, params.eos_token); } { const std::vector tmp = { 0, }; - llama_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads); + llama_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads, params.bos_token, params.eos_token); } llama_print_timings(ctx); @@ -147,7 +147,7 @@ int main(int argc, char ** argv) { // export the cgraph and exit if (params.export_cgraph) { - llama_eval_export(ctx, "llama.ggml"); + llama_eval_export(ctx, "llama.ggml", params.bos_token, params.eos_token); llama_free(ctx); llama_free_model(model); @@ -187,7 +187,7 @@ int main(int argc, char ** argv) { // Add a space in front of the first character to match OG llama tokenizer behavior params.prompt.insert(0, 1, ' '); - embd_inp = ::llama_tokenize(ctx, params.prompt, true); + embd_inp = ::llama_tokenize(ctx, params.prompt, true, true); } else { embd_inp = session_tokens; } @@ -234,8 +234,8 @@ int main(int argc, char ** argv) { } // prefix & suffix for instruct mode - const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true); - const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false); + const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true, true); + const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, false); // in instruct mode, we inject a prefix and a suffix to each input by the user if (params.instruct) { @@ -249,7 +249,7 @@ int main(int argc, char ** argv) { } // determine newline token - auto llama_token_newline = ::llama_tokenize(ctx, "\n", false); + auto llama_token_newline = ::llama_tokenize(ctx, "\n", false, false); if (params.verbose_prompt) { fprintf(stderr, "\n"); @@ -342,8 +342,8 @@ int main(int argc, char ** argv) { // do one empty run to warm up the model { - const std::vector tmp = { llama_token_bos(), }; - llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads); + const std::vector tmp = { params.bos_token, }; + llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads, params.bos_token, params.eos_token); llama_reset_timings(ctx); } @@ -417,7 +417,7 @@ int main(int argc, char ** argv) { if (n_eval > params.n_batch) { n_eval = params.n_batch; } - if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) { + if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads, params.bos_token, params.eos_token)) { fprintf(stderr, "%s : failed to eval\n", __func__); return 1; } @@ -516,11 +516,11 @@ int main(int argc, char ** argv) { } // replace end of text token with newline token when in interactive mode - if (id == llama_token_eos() && params.interactive && !params.instruct) { + if (id == params.eos_token && params.interactive && !params.instruct) { id = llama_token_newline.front(); if (params.antiprompt.size() != 0) { // tokenize and inject first reverse prompt - const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false); + const auto 
                    embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
                }
            }
@@ -626,7 +626,7 @@ int main(int argc, char ** argv) {
                 embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
             }

-            auto line_inp = ::llama_tokenize(ctx, buffer, false);
+            auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
             embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());

             // instruct mode: insert response suffix
@@ -646,7 +646,7 @@ int main(int argc, char ** argv) {
         }

         // end of text token
-        if (!embd.empty() && embd.back() == llama_token_eos()) {
+        if (!embd.empty() && embd.back() == params.eos_token) {
            if (params.instruct) {
                is_interacting = true;
            } else {
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index b59f5971e..5e96b40d2 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -30,7 +30,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
     // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
     // Output: `perplexity: 13.5106 [114/114]`
     // BOS tokens will be added for each chunk before eval
-    auto tokens = ::llama_tokenize(ctx, params.prompt, true);
+    auto tokens = ::llama_tokenize(ctx, params.prompt, true, true);

     int count = 0;

@@ -60,10 +60,10 @@ void perplexity(llama_context * ctx, const gpt_params & params) {

             // add BOS token for the first batch of each chunk
             if (j == 0) {
-                tokens[batch_start] = llama_token_bos();
+                tokens[batch_start] = params.bos_token;
             }

-            if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) {
+            if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads, params.bos_token, params.eos_token)) {
                 fprintf(stderr, "%s : failed to eval\n", __func__);
                 return;
             }
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index de22d3013..fe32030cd 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -261,7 +261,7 @@ struct llama_server_context {

        if (params.n_predict == 0) {
            has_next_token = false;
-           return llama_token_eos();
+           return params.eos_token;
        }

        // out of user input, sample next token
@@ -344,7 +344,7 @@ struct llama_server_context {
        // decrement remaining sampling budget
        --n_remain;

-       if (!embd.empty() && embd.back() == llama_token_eos()) {
+       if (!embd.empty() && embd.back() == params.eos_token) {
            //stopping_word = llama_token_to_str(ctx, embd.back());
            has_next_token = false;
            stopped_eos = true;
@@ -644,7 +644,7 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
 }

 static json format_generation_settings(llama_server_context & llama) {
-    const auto eos_bias = llama.params.logit_bias.find(llama_token_eos());
+    const auto eos_bias = llama.params.logit_bias.find(llama.params.eos_token);
     const bool ignore_eos = eos_bias != llama.params.logit_bias.end() &&
         eos_bias->second < 0.0f && std::isinf(eos_bias->second);

@@ -731,7 +731,7 @@ static void parse_options_completion(const json & body, llama_server_context & l
     llama.params.logit_bias.clear();

     if (body.value("ignore_eos", false)) {
-        llama.params.logit_bias[llama_token_eos()] = -INFINITY;
+        llama.params.logit_bias[default_params.eos_token] = -INFINITY;
     }

     const auto & logit_bias = body.find("logit_bias");
diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
index fc45c9340..f16f6ddb1 100644
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -84,7 +84,7 @@ int main(int argc, char ** argv)
     //---------------------------------

     std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize( ctx , params.prompt , true );
+    tokens_list = ::llama_tokenize( ctx , params.prompt , true, true );

     const int max_context_size     = llama_n_ctx( ctx );
     const int max_tokens_list_size = max_context_size - 4 ;
@@ -123,7 +123,7 @@ int main(int argc, char ** argv)
         // Evaluate the tokens :
         //---------------------------------

-        if ( llama_eval( ctx , tokens_list.data() , tokens_list.size() , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
+        if ( llama_eval( ctx , tokens_list.data() , tokens_list.size() , llama_get_kv_cache_token_count( ctx ) , params.n_threads, params.bos_token, params.eos_token ) )
         {
             fprintf( stderr, "%s : failed to eval\n" , __func__ );
             return 1;
@@ -155,7 +155,7 @@ int main(int argc, char ** argv)

         // is it an end of stream ?
-        if ( new_token_id == llama_token_eos() )
+        if ( new_token_id == params.eos_token )
         {
             fprintf(stderr, " [end of text]\n");
             break;
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index 61c829e5c..fac811079 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -2003,7 +2003,7 @@ void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * tokens)
     }
 }

-void get_example_targets(const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
+void get_example_targets(const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs, llama_token bos_token = 1) {
     int n_tokens = tokens_input->ne[0];
     int n_vocab  = target_logits->ne[0];

@@ -2012,7 +2012,7 @@ void get_example_targets(const int * train_samples, size_t n_train_samples, cons
     ggml_set_f32(target_logits, -1.0f/n_vocab);
     ggml_set_f32(target_probs,   0.0f);
-    ggml_set_i32_1d(tokens_input, 0, llama_token_bos());
+    ggml_set_i32_1d(tokens_input, 0, bos_token);
     for (int i=1; i<n_tokens+1; ++i) {
@@ -2029,7 +2029,7 @@ void get_example_targets(const int * train_samples, size_t n_train_samples, cons
-void get_example_targets_batch(struct llama_context * /*lctx*/, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
+void get_example_targets_batch(struct llama_context * /*lctx*/, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs, llama_token bos_token = 1, llama_token eos_token = 2) {
     GGML_ASSERT(tokens_input->n_dims  == 2);
     GGML_ASSERT(target_logits->n_dims == 3);
     GGML_ASSERT(target_probs->n_dims  == 3);
@@ -2043,7 +2043,7 @@ void get_example_targets_batch(struct llama_context * /*lctx*/, const int * trai
         size_t sample = train_samples[(example_id*n_batch + k) % n_train_samples];
         GGML_ASSERT(sample+n_tokens-1 < n_train_data);

-        set_i32_2d(tokens_input, 0, k, llama_token_bos());
+        set_i32_2d(tokens_input, 0, k, bos_token);
         for (int i=1; i<n_tokens+1; ++i) {
@@ -2188,7 +2188,7 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vector<llama_token> & out) {
-    int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), buf.size(), false);
+    int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), buf.size(), false, false);
     if (n_tokens >= 0) {
         out.resize(n_tokens);
     }
@@ -2698,6 +2698,9 @@ struct train_params {
     int print_info_interval;
     int print_details_interval;

+    int bos_token;
+    int eos_token;
+
     bool samples_start_after_nl;
     bool use_adam;
     bool use_flash;
@@ -3231,7 +3234,7 @@ int main(int argc, char ** argv) {
         gf->n_threads = params.n_threads;
         gb->n_threads = params.n_threads;

-        get_example_targets_batch(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs);
+        get_example_targets_batch(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs, params.bos_token, params.eos_token);

         GGML_ASSERT(n_past == 0);
diff --git a/llama.cpp b/llama.cpp
index ac22a48f8..501e24c46 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1373,14 +1373,22 @@ static bool llama_eval_internal(
         const int   n_tokens,
         const int   n_past,
         const int   n_threads,
-        const char * cgraph_fname) {
+        const char * cgraph_fname,
+              int    bos_token,
+              int    eos_token) {

     // enforce that the first token is BOS
-    if (n_past == 0 && tokens[0] != llama_token_bos()) {
+    if (n_past == 0 && tokens[0] != bos_token) {
         fprintf(stderr, "%s: first token must be BOS\n", __func__);
         return false;
     }

+    // enforce that the last token is EOS
+    // if (n_past == 0 && tokens[-1] != eos_token) {
+    //     fprintf(stderr, "%s: last token must be EOS\n", __func__);
+    //     return false;
+    // }
+
     const int64_t t_start_us = ggml_time_us();

     const int N = n_tokens;
@@ -1925,7 +1933,7 @@ private:
     llama_sp_bigram::queue work_queue_;
 };

-static std::vector<llama_token> llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos) {
+static std::vector<llama_token> llama_tokenize(const llama_vocab & vocab, const std::string & text, int bos_token, int eos_token) {
     llama_tokenizer tokenizer(vocab);

     std::vector<llama_token> output;
@@ -1933,11 +1941,16 @@ static std::vector<llama_token> llama_tokenize(const llama_vocab & vocab, co
         return output;
     }

-    if (bos) {
-        output.push_back(llama_token_bos());
+    if (bos_token != 0) {
+        output.push_back(bos_token);
     }

     tokenizer.tokenize(text, output);
+
+    if (eos_token != 0) {
+        output.push_back(eos_token);
+    }
+
     return output;
 }

@@ -3407,8 +3420,10 @@ int llama_eval(
         const llama_token * tokens,
                       int   n_tokens,
                       int   n_past,
-                      int   n_threads) {
-    if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) {
+                      int   n_threads,
+                      int   bos_token,
+                      int   eos_token) {
+    if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr, bos_token, eos_token)) {
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;
     }
@@ -3423,13 +3438,13 @@ int llama_eval(
     return 0;
 }

-int llama_eval_export(struct llama_context * ctx, const char * fname) {
+int llama_eval_export(struct llama_context * ctx, const char * fname, int bos_token = 1, int eos_token = 2) {
     const int n_batch = 1;
     const int n_ctx   = 512 - n_batch;

-    const std::vector<llama_token> tmp(n_batch, llama_token_bos());
+    const std::vector<llama_token> tmp(n_batch, bos_token);

-    if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname)) {
+    if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname, bos_token, eos_token)) {
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;
     }
@@ -3442,8 +3457,9 @@ int llama_tokenize(
         const char * text,
         llama_token * tokens,
         int   n_max_tokens,
-        bool   add_bos) {
-    auto res = llama_tokenize(ctx->vocab, text, add_bos);
+        bool   add_bos,
+        bool   add_eos) {
+    auto res = llama_tokenize(ctx->vocab, text, add_bos, add_eos);

     if (n_max_tokens < (int) res.size()) {
         fprintf(stderr, "%s: too many tokens\n", __func__);
@@ -3498,14 +3514,6 @@ const char * llama_token_to_str(const struct llama_context * ctx, llama_token to
     return ctx->vocab.id_to_token[token].tok.c_str();
 }

-llama_token llama_token_bos() {
-    return 1;
-}
-
-llama_token llama_token_eos() {
-    return 2;
-}
-
 llama_token llama_token_nl() {
     return 13;
 }
diff --git a/llama.h b/llama.h
index a833a7f4d..4b6edd1d1 100644
--- a/llama.h
+++ b/llama.h
@@ -223,13 +223,15 @@ extern "C" {
            const llama_token * tokens,
                          int   n_tokens,
                          int   n_past,
-                         int   n_threads);
+                         int   n_threads,
+                         int   bos_token,
+                         int   eos_token);

     // Export a static computation graph for context of 511 and batch size of 1
     // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
     //       parameters here to keep things simple
     // IMPORTANT: do not use for anything else other than debugging and testing!
-    LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
+    LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname, int bos_token, int eos_token);

     // Convert the provided text into tokens.
     // The tokens pointer must be large enough to hold the resulting tokens.
@@ -241,7 +243,8 @@ extern "C" {
             const char * text,
            llama_token * tokens,
                    int   n_max_tokens,
-                  bool   add_bos);
+                  bool   add_bos,
+                  bool   add_eos);

     LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
     LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
@@ -270,8 +273,8 @@ extern "C" {
     LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);

     // Special tokens
-    LLAMA_API llama_token llama_token_bos();  // beginning-of-sentence
-    LLAMA_API llama_token llama_token_eos();  // end-of-sentence
+//    LLAMA_API llama_token llama_token_bos();  // beginning-of-sentence
+//    LLAMA_API llama_token llama_token_eos();  // end-of-sentence
     LLAMA_API llama_token llama_token_nl();   // next-line

     // Sampling functions
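
Usage sketch (not part of the patch): the snippet below is one way a caller could drive the modified API, assuming the draft signatures above and the default SentencePiece ids from examples/common.h in this patch (bos_token = 1, eos_token = 2). The helper name eval_prompt and its surrounding setup are illustrative only; model and context creation are omitted.

    // Illustrative only: exercises the llama_tokenize()/llama_eval() signatures
    // from this draft. `ctx` is assumed to come from llama_init_from_file()
    // (or an equivalent loader); error handling mirrors the examples above.
    #include "llama.h"

    #include <cstdio>
    #include <string>
    #include <vector>

    static bool eval_prompt(llama_context * ctx, const std::string & prompt, int n_threads,
                            llama_token bos_token = 1, llama_token eos_token = 2) {
        // One slot per prompt char plus room for BOS/EOS, as examples/common.cpp does.
        std::vector<llama_token> tokens(prompt.size() + 2);

        // add_bos = true so the BOS check in llama_eval_internal() passes with the
        // default ids; add_eos = false so generation can continue past the prompt.
        const int n = llama_tokenize(ctx, prompt.c_str(), tokens.data(), (int) tokens.size(),
                                     /*add_bos =*/ true, /*add_eos =*/ false);
        if (n < 0) {
            fprintf(stderr, "%s: tokenization failed\n", __func__);
            return false;
        }
        tokens.resize(n);

        // llama_eval() now carries the ids to treat as BOS/EOS instead of the
        // removed llama_token_bos()/llama_token_eos() helpers.
        if (llama_eval(ctx, tokens.data(), (int) tokens.size(), /*n_past =*/ 0, n_threads,
                       bos_token, eos_token)) {
            fprintf(stderr, "%s: llama_eval failed\n", __func__);
            return false;
        }
        return true;
    }

Since the patched tokenizer only appends a special token when the corresponding flag/id is non-zero, passing false (0) for add_bos or add_eos skips that token entirely.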