llama : fix bpe tokenize from byte (#2889)

This commit is contained in:
opparco 2023-09-03 19:18:09 +09:00 committed by GitHub
parent d9151e6f57
commit 3730134776
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -3366,9 +3366,15 @@ struct llm_tokenizer_bpe {
std::string byte_str(1, *j); std::string byte_str(1, *j);
auto token_multibyte = vocab.token_to_id.find(byte_str); auto token_multibyte = vocab.token_to_id.find(byte_str);
if (token_multibyte == vocab.token_to_id.end()) { if (token_multibyte == vocab.token_to_id.end()) {
fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str()); try {
llama_token token_byte = llama_byte_to_token(vocab, *j);
output.push_back(token_byte);
} catch (const std::out_of_range & err) {
fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
}
} else {
output.push_back((*token_multibyte).second);
} }
output.push_back((*token_multibyte).second);
} }
} else { } else {
output.push_back((*token).second); output.push_back((*token).second);