mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-13 04:00:16 +00:00
llama : fix whitespace escaping in tokenizer (#2724)
This commit is contained in:
parent
c63bb1d16a
commit
46ef5b5fcf
13
llama.cpp
13
llama.cpp
@ -2253,18 +2253,11 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static std::string llama_escape_whitespace(const std::string& text) {
|
static std::string llama_escape_whitespace(const std::string& text) {
|
||||||
std::string result;
|
std::string result = "\xe2\x96\x81";
|
||||||
bool escaping = false;
|
|
||||||
result += "\xe2\x96\x81";
|
|
||||||
for (size_t offs = 0; offs < text.length(); ++offs) {
|
for (size_t offs = 0; offs < text.length(); ++offs) {
|
||||||
if (text[offs] == ' ') {
|
if (text[offs] == ' ') {
|
||||||
if (!escaping) {
|
result += "\xe2\x96\x81";
|
||||||
result += "\xe2\x96\x81";
|
} else {
|
||||||
escaping = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
escaping = false;
|
|
||||||
result += text[offs];
|
result += text[offs];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -17,6 +17,8 @@ static std::string unescape_whitespace(llama_context* ctx, const std::vector<lla
|
|||||||
static const std::map<std::string, std::vector<llama_token>> & k_tests() {
|
static const std::map<std::string, std::vector<llama_token>> & k_tests() {
|
||||||
static std::map<std::string, std::vector<llama_token>> _k_tests = {
|
static std::map<std::string, std::vector<llama_token>> _k_tests = {
|
||||||
{ " ", {1, 259, }, },
|
{ " ", {1, 259, }, },
|
||||||
|
{ " ", { 1, 1678, }, },
|
||||||
|
{ " ", { 1, 268, }, },
|
||||||
{ "\t", { 1, 29871, 12, }, },
|
{ "\t", { 1, 29871, 12, }, },
|
||||||
{ "\n", { 1, 29871, 13, }, },
|
{ "\n", { 1, 29871, 13, }, },
|
||||||
{ "\t\n", { 1, 29871, 12, 13, }, },
|
{ "\t\n", { 1, 29871, 12, 13, }, },
|
||||||
@ -38,6 +40,12 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
|
|||||||
243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598,
|
243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598,
|
||||||
313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681,
|
313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681,
|
||||||
313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, },
|
313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, },
|
||||||
|
{ "Hello", { 1, 15043 }, },
|
||||||
|
{ " Hello", { 1, 29871, 15043 }, },
|
||||||
|
{ " Hello", { 1, 259, 15043 }, },
|
||||||
|
{ " Hello", { 1, 1678, 15043 }, },
|
||||||
|
{ " Hello", { 1, 268, 15043 }, },
|
||||||
|
{ " Hello\n Hello", { 1, 268, 15043, 13, 1678, 15043 }, },
|
||||||
};
|
};
|
||||||
|
|
||||||
return _k_tests;
|
return _k_tests;
|
||||||
@ -106,7 +114,8 @@ int main(int argc, char **argv) {
|
|||||||
|
|
||||||
if (!correct) {
|
if (!correct) {
|
||||||
fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
|
fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
|
||||||
fprintf(stderr, "%s : detokenized to: '%s'\n", __func__, unescape_whitespace(ctx, test_kv.second).c_str());
|
fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
|
||||||
|
unescape_whitespace(ctx, res).c_str(), unescape_whitespace(ctx, test_kv.second).c_str());
|
||||||
fprintf(stderr, "%s : expected tokens: ", __func__);
|
fprintf(stderr, "%s : expected tokens: ", __func__);
|
||||||
for (const auto & t : test_kv.second) {
|
for (const auto & t : test_kv.second) {
|
||||||
fprintf(stderr, "%6d, ", t);
|
fprintf(stderr, "%6d, ", t);
|
||||||
|
@ -11,18 +11,11 @@
|
|||||||
#include <locale>
|
#include <locale>
|
||||||
|
|
||||||
static std::string escape_whitespace(const std::string& text) {
|
static std::string escape_whitespace(const std::string& text) {
|
||||||
std::string result;
|
std::string result = "\xe2\x96\x81";
|
||||||
bool escaping = false;
|
|
||||||
result += "\xe2\x96\x81";
|
|
||||||
for (size_t offs = 0; offs < text.length(); ++offs) {
|
for (size_t offs = 0; offs < text.length(); ++offs) {
|
||||||
if (text[offs] == ' ') {
|
if (text[offs] == ' ') {
|
||||||
if (!escaping) {
|
result += "\xe2\x96\x81";
|
||||||
result += "\xe2\x96\x81";
|
} else {
|
||||||
escaping = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
escaping = false;
|
|
||||||
result += text[offs];
|
result += text[offs];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user