llama : fix whitespace escaping in tokenizer (#2724)

This commit is contained in:
goerch 2023-08-22 23:10:42 +02:00 committed by GitHub
parent c63bb1d16a
commit 46ef5b5fcf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 16 additions and 21 deletions

View File

@@ -2253,18 +2253,11 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
} }
static std::string llama_escape_whitespace(const std::string& text) { static std::string llama_escape_whitespace(const std::string& text) {
std::string result; std::string result = "\xe2\x96\x81";
bool escaping = false;
result += "\xe2\x96\x81";
for (size_t offs = 0; offs < text.length(); ++offs) { for (size_t offs = 0; offs < text.length(); ++offs) {
if (text[offs] == ' ') { if (text[offs] == ' ') {
if (!escaping) { result += "\xe2\x96\x81";
result += "\xe2\x96\x81"; } else {
escaping = true;
}
}
else {
escaping = false;
result += text[offs]; result += text[offs];
} }
} }

View File

@@ -17,6 +17,8 @@ static std::string unescape_whitespace(llama_context* ctx, const std::vector<lla
static const std::map<std::string, std::vector<llama_token>> & k_tests() { static const std::map<std::string, std::vector<llama_token>> & k_tests() {
static std::map<std::string, std::vector<llama_token>> _k_tests = { static std::map<std::string, std::vector<llama_token>> _k_tests = {
{ " ", {1, 259, }, }, { " ", {1, 259, }, },
{ " ", { 1, 1678, }, },
{ " ", { 1, 268, }, },
{ "\t", { 1, 29871, 12, }, }, { "\t", { 1, 29871, 12, }, },
{ "\n", { 1, 29871, 13, }, }, { "\n", { 1, 29871, 13, }, },
{ "\t\n", { 1, 29871, 12, 13, }, }, { "\t\n", { 1, 29871, 12, 13, }, },
@@ -38,6 +40,12 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598, 243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598,
313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681, 313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681,
313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, }, 313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, },
{ "Hello", { 1, 15043 }, },
{ " Hello", { 1, 29871, 15043 }, },
{ " Hello", { 1, 259, 15043 }, },
{ " Hello", { 1, 1678, 15043 }, },
{ " Hello", { 1, 268, 15043 }, },
{ " Hello\n Hello", { 1, 268, 15043, 13, 1678, 15043 }, },
}; };
return _k_tests; return _k_tests;
@@ -106,7 +114,8 @@ int main(int argc, char **argv) {
if (!correct) { if (!correct) {
fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str()); fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
fprintf(stderr, "%s : detokenized to: '%s'\n", __func__, unescape_whitespace(ctx, test_kv.second).c_str()); fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
unescape_whitespace(ctx, res).c_str(), unescape_whitespace(ctx, test_kv.second).c_str());
fprintf(stderr, "%s : expected tokens: ", __func__); fprintf(stderr, "%s : expected tokens: ", __func__);
for (const auto & t : test_kv.second) { for (const auto & t : test_kv.second) {
fprintf(stderr, "%6d, ", t); fprintf(stderr, "%6d, ", t);

View File

@@ -11,18 +11,11 @@
#include <locale> #include <locale>
static std::string escape_whitespace(const std::string& text) { static std::string escape_whitespace(const std::string& text) {
std::string result; std::string result = "\xe2\x96\x81";
bool escaping = false;
result += "\xe2\x96\x81";
for (size_t offs = 0; offs < text.length(); ++offs) { for (size_t offs = 0; offs < text.length(); ++offs) {
if (text[offs] == ' ') { if (text[offs] == ' ') {
if (!escaping) { result += "\xe2\x96\x81";
result += "\xe2\x96\x81"; } else {
escaping = true;
}
}
else {
escaping = false;
result += text[offs]; result += text[offs];
} }
} }