Mirror of https://github.com/ggerganov/llama.cpp.git

Commit afee3cfc1f (parent 5ec8dd5a3c)
Draft for #1776: make the BOS and EOS tokens available as user input instead of hard-coded values.
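For orientation before the diff itself, here is a minimal sketch (not part of the commit) of the calling convention this draft introduces on the examples side: the token ids live in gpt_params and are forwarded explicitly to tokenization and evaluation. The helper name run_prompt and the surrounding setup are assumptions for illustration only.

// Sketch only: assumes the gpt_params fields and signatures added by this draft.
#include "common.h"
#include "llama.h"
#include <cstdio>

static int run_prompt(llama_context * ctx, gpt_params & params) {
    params.bos_token = 1; // default BOS id, now user-configurable
    params.eos_token = 2; // default EOS id, now user-configurable

    // Tokenize with explicit control over BOS/EOS insertion ...
    auto embd_inp = ::llama_tokenize(ctx, params.prompt, /*add_bos=*/true, /*add_eos=*/true);

    // ... and forward the configured ids into evaluation.
    if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), /*n_past=*/0,
                   params.n_threads, params.bos_token, params.eos_token)) {
        fprintf(stderr, "%s : failed to eval\n", __func__);
        return 1;
    }
    return 0;
}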
@@ -356,7 +356,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         } else if (arg == "--perplexity") {
             params.perplexity = true;
         } else if (arg == "--ignore-eos") {
-            params.logit_bias[llama_token_eos()] = -INFINITY;
+            params.logit_bias[params.eos_token] = -INFINITY;
         } else if (arg == "--no-penalize-nl") {
             params.penalize_nl = false;
         } else if (arg == "-l" || arg == "--logit-bias") {
@@ -526,10 +526,10 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
 }
 
 // TODO: not great allocating this every time
-std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
+std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos, bool add_eos) {
     // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
-    std::vector<llama_token> res(text.size() + (int) add_bos);
-    const int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
+    std::vector<llama_token> res(text.size() + (int) add_bos + (int) add_eos);
+    const int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos, add_eos);
     assert(n >= 0);
     res.resize(n);
 
@@ -78,6 +78,9 @@ struct gpt_params {
     bool mem_test = false; // compute maximum memory usage
     bool export_cgraph = false; // export the computation graph
     bool verbose_prompt = false; // print prompt tokens before generation
+
+    int bos_token = 1; // beginning of sentence token
+    int eos_token = 2; // end of sentence token
 };
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
@@ -90,7 +93,7 @@ std::string gpt_random_prompt(std::mt19937 & rng);
 // Vocab utils
 //
 
-std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
+std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos, bool add_eos);
 
 //
 // Model utils
@@ -60,7 +60,7 @@ int main(int argc, char ** argv) {
     params.prompt.insert(0, 1, ' ');
 
     // tokenize the prompt
-    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
 
     if (params.verbose_prompt) {
         fprintf(stderr, "\n");
@@ -74,7 +74,7 @@ int main(int argc, char ** argv) {
 
     if (params.embedding){
         if (embd_inp.size() > 0) {
-            if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past, params.n_threads)) {
+            if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past, params.n_threads, params.bos_token, params.eos_token)) {
                 fprintf(stderr, "%s : failed to eval\n", __func__);
                 return 1;
             }
@@ -129,13 +129,13 @@ int main(int argc, char ** argv) {
     // uncomment the "used_mem" line in llama.cpp to see the results
     if (params.mem_test) {
         {
-            const std::vector<llama_token> tmp(params.n_batch, llama_token_bos());
-            llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
+            const std::vector<llama_token> tmp(params.n_batch, params.bos_token);
+            llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads, params.bos_token, params.eos_token);
         }
 
         {
             const std::vector<llama_token> tmp = { 0, };
-            llama_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads);
+            llama_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads, params.bos_token, params.eos_token);
         }
 
         llama_print_timings(ctx);
@@ -147,7 +147,7 @@ int main(int argc, char ** argv) {
 
     // export the cgraph and exit
     if (params.export_cgraph) {
-        llama_eval_export(ctx, "llama.ggml");
+        llama_eval_export(ctx, "llama.ggml", params.bos_token, params.eos_token);
         llama_free(ctx);
         llama_free_model(model);
 
@@ -187,7 +187,7 @@ int main(int argc, char ** argv) {
         // Add a space in front of the first character to match OG llama tokenizer behavior
         params.prompt.insert(0, 1, ' ');
 
-        embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+        embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
     } else {
         embd_inp = session_tokens;
     }
@@ -234,8 +234,8 @@ int main(int argc, char ** argv) {
     }
 
     // prefix & suffix for instruct mode
-    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true);
-    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);
+    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true, true);
+    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, false);
 
     // in instruct mode, we inject a prefix and a suffix to each input by the user
     if (params.instruct) {
@@ -249,7 +249,7 @@ int main(int argc, char ** argv) {
     }
 
     // determine newline token
-    auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
+    auto llama_token_newline = ::llama_tokenize(ctx, "\n", false, false);
 
     if (params.verbose_prompt) {
         fprintf(stderr, "\n");
@@ -342,8 +342,8 @@ int main(int argc, char ** argv) {
 
     // do one empty run to warm up the model
     {
-        const std::vector<llama_token> tmp = { llama_token_bos(), };
-        llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
+        const std::vector<llama_token> tmp = { params.bos_token, };
+        llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads, params.bos_token, params.eos_token);
         llama_reset_timings(ctx);
     }
 
@@ -417,7 +417,7 @@ int main(int argc, char ** argv) {
                 if (n_eval > params.n_batch) {
                     n_eval = params.n_batch;
                 }
-                if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) {
+                if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads, params.bos_token, params.eos_token)) {
                     fprintf(stderr, "%s : failed to eval\n", __func__);
                     return 1;
                 }
@@ -516,11 +516,11 @@ int main(int argc, char ** argv) {
             }
 
             // replace end of text token with newline token when in interactive mode
-            if (id == llama_token_eos() && params.interactive && !params.instruct) {
+            if (id == params.eos_token && params.interactive && !params.instruct) {
                 id = llama_token_newline.front();
                 if (params.antiprompt.size() != 0) {
                     // tokenize and inject first reverse prompt
-                    const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
+                    const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, false);
                     embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
                 }
             }
@@ -626,7 +626,7 @@ int main(int argc, char ** argv) {
                 embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
             }
 
-            auto line_inp = ::llama_tokenize(ctx, buffer, false);
+            auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
             embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
 
             // instruct mode: insert response suffix
@@ -646,7 +646,7 @@ int main(int argc, char ** argv) {
         }
 
         // end of text token
-        if (!embd.empty() && embd.back() == llama_token_eos()) {
+        if (!embd.empty() && embd.back() == params.eos_token) {
             if (params.instruct) {
                 is_interacting = true;
             } else {
@@ -30,7 +30,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
     // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
    // Output: `perplexity: 13.5106 [114/114]`
     // BOS tokens will be added for each chunk before eval
-    auto tokens = ::llama_tokenize(ctx, params.prompt, true);
+    auto tokens = ::llama_tokenize(ctx, params.prompt, true, true);
 
     int count = 0;
 
@@ -60,10 +60,10 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
 
             // add BOS token for the first batch of each chunk
             if (j == 0) {
-                tokens[batch_start] = llama_token_bos();
+                tokens[batch_start] = params.bos_token;
             }
 
-            if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) {
+            if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads, params.bos_token, params.eos_token)) {
                 fprintf(stderr, "%s : failed to eval\n", __func__);
                 return;
             }
@@ -261,7 +261,7 @@ struct llama_server_context {
 
         if (params.n_predict == 0) {
             has_next_token = false;
-            return llama_token_eos();
+            return params.eos_token;
         }
 
         // out of user input, sample next token
@@ -344,7 +344,7 @@ struct llama_server_context {
         // decrement remaining sampling budget
         --n_remain;
 
-        if (!embd.empty() && embd.back() == llama_token_eos()) {
+        if (!embd.empty() && embd.back() == params.eos_token) {
             //stopping_word = llama_token_to_str(ctx, embd.back());
             has_next_token = false;
             stopped_eos = true;
@@ -644,7 +644,7 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
 }
 
 static json format_generation_settings(llama_server_context & llama) {
-    const auto eos_bias = llama.params.logit_bias.find(llama_token_eos());
+    const auto eos_bias = llama.params.logit_bias.find(llama.params.eos_token);
     const bool ignore_eos = eos_bias != llama.params.logit_bias.end() &&
                             eos_bias->second < 0.0f && std::isinf(eos_bias->second);
 
@@ -731,7 +731,7 @@ static void parse_options_completion(const json & body, llama_server_context & l
 
     llama.params.logit_bias.clear();
     if (body.value("ignore_eos", false)) {
-        llama.params.logit_bias[llama_token_eos()] = -INFINITY;
+        llama.params.logit_bias[default_params.eos_token] = -INFINITY;
     }
 
     const auto & logit_bias = body.find("logit_bias");
@@ -84,7 +84,7 @@ int main(int argc, char ** argv)
     //---------------------------------
 
     std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize( ctx , params.prompt , true );
+    tokens_list = ::llama_tokenize( ctx , params.prompt , true, true );
 
     const int max_context_size = llama_n_ctx( ctx );
     const int max_tokens_list_size = max_context_size - 4 ;
@@ -123,7 +123,7 @@ int main(int argc, char ** argv)
         // Evaluate the tokens :
         //---------------------------------
 
-        if ( llama_eval( ctx , tokens_list.data() , tokens_list.size() , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
+        if ( llama_eval( ctx , tokens_list.data() , tokens_list.size() , llama_get_kv_cache_token_count( ctx ) , params.n_threads, params.bos_token, params.eos_token ) )
         {
             fprintf( stderr, "%s : failed to eval\n" , __func__ );
             return 1;
@@ -155,7 +155,7 @@ int main(int argc, char ** argv)
 
 
         // is it an end of stream ?
-        if ( new_token_id == llama_token_eos() )
+        if ( new_token_id == params.eos_token )
         {
             fprintf(stderr, " [end of text]\n");
             break;
@@ -2003,7 +2003,7 @@ void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * tokens)
     }
 }
 
-void get_example_targets(const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
+void get_example_targets(const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs, llama_token bos_token = 1) {
     int n_tokens = tokens_input->ne[0];
     int n_vocab = target_logits->ne[0];
 
@@ -2012,7 +2012,7 @@ void get_example_targets(const int * train_samples, size_t n_train_samples, cons
 
     ggml_set_f32(target_logits, -1.0f/n_vocab);
     ggml_set_f32(target_probs, 0.0f);
-    ggml_set_i32_1d(tokens_input, 0, llama_token_bos());
+    ggml_set_i32_1d(tokens_input, 0, bos_token);
     for (int i=1; i<n_tokens+1; ++i) {
         int token = clamp(train_data[sample+i-1], 0, n_vocab-1);
         set_f32_2d(target_logits, token, i-1, +1.0f);
@@ -2023,7 +2023,7 @@ void get_example_targets(const int * train_samples, size_t n_train_samples, cons
     }
 }
 
-void get_example_targets_batch(struct llama_context * /*lctx*/, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
+void get_example_targets_batch(struct llama_context * /*lctx*/, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs, int bos_token, int eos_token) {
     GGML_ASSERT(tokens_input->n_dims == 2);
     GGML_ASSERT(target_logits->n_dims == 3);
     GGML_ASSERT(target_probs->n_dims == 3);
@@ -2043,7 +2043,7 @@ void get_example_targets_batch(struct llama_context * /*lctx*/, const int * trai
         size_t sample = train_samples[(example_id*n_batch + k) % n_train_samples];
         GGML_ASSERT(sample+n_tokens-1 < n_train_data);
 
-        set_i32_2d(tokens_input, 0, k, llama_token_bos());
+        set_i32_2d(tokens_input, 0, k, bos_token);
         for (int i=1; i<n_tokens+1; ++i) {
             int token = clamp(train_data[sample+i-1], 0, n_vocab-1);
             // print_token(lctx, token);
@@ -2198,7 +2198,7 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
 
     out.resize(buf.size());
 
-    int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), buf.size(), false);
+    int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), buf.size(), false, false);
     if (n_tokens >= 0) {
         out.resize(n_tokens);
     }
@@ -2698,6 +2698,9 @@ struct train_params {
     int print_info_interval;
     int print_details_interval;
 
+    int bos_token;
+    int eos_token;
+
     bool samples_start_after_nl;
     bool use_adam;
     bool use_flash;
@@ -3231,7 +3234,7 @@ int main(int argc, char ** argv) {
         gf->n_threads = params.n_threads;
         gb->n_threads = params.n_threads;
 
-        get_example_targets_batch(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs);
+        get_example_targets_batch(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs, params.bos_token, params.eos_token);
 
         GGML_ASSERT(n_past == 0);
 
llama.cpp (48 changed lines)
@@ -1373,14 +1373,22 @@ static bool llama_eval_internal(
         const int n_tokens,
         const int n_past,
         const int n_threads,
-        const char * cgraph_fname) {
+        const char * cgraph_fname,
+        int bos_token,
+        int eos_token) {
 
     // enforce that the first token is BOS
-    if (n_past == 0 && tokens[0] != llama_token_bos()) {
+    if (n_past == 0 && tokens[0] != bos_token) {
         fprintf(stderr, "%s: first token must be BOS\n", __func__);
         return false;
     }
 
+    // enforce that the last token is EOS
+    // if (n_past == 0 && tokens[-1] != eos_token) {
+    //     fprintf(stderr, "%s: last token must be EOS\n", __func__);
+    //     return false;
+    // }
+
     const int64_t t_start_us = ggml_time_us();
 
     const int N = n_tokens;
@@ -1925,7 +1933,7 @@ private:
     llama_sp_bigram::queue work_queue_;
 };
 
-static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos) {
+static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & text, int bos_token, int eos_token) {
     llama_tokenizer tokenizer(vocab);
     std::vector<llama_vocab::id> output;
 
@@ -1933,11 +1941,16 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
         return output;
     }
 
-    if (bos) {
-        output.push_back(llama_token_bos());
+    if (bos_token != 0) {
+        output.push_back(bos_token);
     }
 
     tokenizer.tokenize(text, output);
+
+    if (eos_token != 0) {
+        output.push_back(eos_token);
+    }
+
     return output;
 }
 
@@ -3407,8 +3420,10 @@ int llama_eval(
         const llama_token * tokens,
         int n_tokens,
         int n_past,
-        int n_threads) {
-    if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) {
+        int n_threads,
+        int bos_token,
+        int eos_token) {
+    if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr, bos_token, eos_token)) {
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;
     }
@@ -3423,13 +3438,13 @@ int llama_eval(
     return 0;
 }
 
-int llama_eval_export(struct llama_context * ctx, const char * fname) {
+int llama_eval_export(struct llama_context * ctx, const char * fname, int bos_token = 1, int eos_token = 2) {
     const int n_batch = 1;
     const int n_ctx = 512 - n_batch;
 
-    const std::vector<llama_token> tmp(n_batch, llama_token_bos());
+    const std::vector<llama_token> tmp(n_batch, bos_token);
 
-    if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname)) {
+    if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname, bos_token, eos_token)) {
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;
     }
@@ -3442,8 +3457,9 @@ int llama_tokenize(
         const char * text,
         llama_token * tokens,
         int n_max_tokens,
-        bool add_bos) {
-    auto res = llama_tokenize(ctx->vocab, text, add_bos);
+        bool add_bos,
+        bool add_eos) {
+    auto res = llama_tokenize(ctx->vocab, text, add_bos, add_eos);
 
     if (n_max_tokens < (int) res.size()) {
         fprintf(stderr, "%s: too many tokens\n", __func__);
@@ -3498,14 +3514,6 @@ const char * llama_token_to_str(const struct llama_context * ctx, llama_token to
     return ctx->vocab.id_to_token[token].tok.c_str();
 }
 
-llama_token llama_token_bos() {
-    return 1;
-}
-
-llama_token llama_token_eos() {
-    return 2;
-}
-
 llama_token llama_token_nl() {
     return 13;
 }
llama.h (13 changed lines)
@@ -223,13 +223,15 @@ extern "C" {
         const llama_token * tokens,
         int n_tokens,
         int n_past,
-        int n_threads);
+        int n_threads,
+        int bos_token,
+        int eos_token);
 
     // Export a static computation graph for context of 511 and batch size of 1
     // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
     // parameters here to keep things simple
     // IMPORTANT: do not use for anything else other than debugging and testing!
-    LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
+    LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname, int bos_token, int eos_token);
 
     // Convert the provided text into tokens.
     // The tokens pointer must be large enough to hold the resulting tokens.
@@ -241,7 +243,8 @@ extern "C" {
         const char * text,
         llama_token * tokens,
         int n_max_tokens,
-        bool add_bos);
+        bool add_bos,
+        bool add_eos);
 
     LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
     LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
@@ -270,8 +273,8 @@ extern "C" {
     LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
 
     // Special tokens
-    LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
-    LLAMA_API llama_token llama_token_eos(); // end-of-sentence
+    // LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
+    // LLAMA_API llama_token llama_token_eos(); // end-of-sentence
     LLAMA_API llama_token llama_token_nl(); // next-line
 
     // Sampling functions
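Putting the llama.h changes together, a caller built against this draft would exercise the extended C API roughly as follows. This is an illustrative sketch under the declarations above; the context setup, thread count, buffer size, and token ids are assumed values, not part of the commit.

// Illustrative only: uses the llama_tokenize / llama_eval signatures declared in this draft.
#include "llama.h"
#include <cstdio>
#include <vector>

static int eval_text(llama_context * ctx, const char * text) {
    const int bos_token = 1; // caller-chosen id, replacing the removed llama_token_bos()
    const int eos_token = 2; // caller-chosen id, replacing the removed llama_token_eos()

    std::vector<llama_token> tokens(256);
    const int n = llama_tokenize(ctx, text, tokens.data(), tokens.size(),
                                 /*add_bos=*/true, /*add_eos=*/true);
    if (n < 0) {
        return 1; // buffer too small or tokenization failure
    }
    tokens.resize(n);

    // BOS/EOS ids are now threaded through evaluation instead of being hard-coded inside llama.cpp.
    if (llama_eval(ctx, tokens.data(), tokens.size(), /*n_past=*/0,
                   /*n_threads=*/4, bos_token, eos_token)) {
        fprintf(stderr, "%s: failed to eval\n", __func__);
        return 1;
    }
    return 0;
}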