Add tests

This commit is contained in:
Howard Su 2023-07-02 22:06:10 +08:00
parent e818537027
commit 6caa06638f

View File

@ -14,6 +14,7 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests()
{ " this is 🦙.cpp", { 1, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, }, { " this is 🦙.cpp", { 1, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
{ "w048 7tuijk dsdfhu", { 1, 29893, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, }, { "w048 7tuijk dsdfhu", { 1, 29893, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
{ "нещо на Български", { 1, 821, 4851, 665, 1386, 29713, 1305, }, }, { "нещо на Български", { 1, 821, 4851, 665, 1386, 29713, 1305, }, },
{ ">>>>ANSWER<<", { 1, 6778, 6778, 2190, 23066, 1001, 9314,}, },
}; };
return _k_tests; return _k_tests;
}; };
@ -94,6 +95,38 @@ int main(int argc, char **argv) {
} }
} }
#if 0
    // Disabled diagnostic: how many tokens would not tokenize to themselves.
    // For every token id from 1 up to the vocabulary size, take the token's
    // string form, tokenize that string again (last argument `false`), and
    // report each resulting token that differs from the id we started with.
    for (llama_token i = 1; i < llama_n_vocab(ctx); i++)
    {
        const char * str = llama_token_to_str(ctx, i);
        std::vector<llama_token> res(100);
        const int n = llama_tokenize(ctx, str, res.data(), int(res.size()), false);
        if (n < 0) {
            // llama_tokenize signals failure with a negative count (per llama.h;
            // e.g. the output buffer was too small). Passing that to resize()
            // would convert it to a huge unsigned size, so report and move on.
            fprintf(stderr, "%s : failed test: '%s'\n", __func__, str);
            fprintf(stderr, "%s : llama_tokenize returned %d for token %d\n", __func__, n, i);
            continue;
        }
        res.resize(n);
        for (const auto & t : res)
        {
            // NOTE(review): token 1 is the first entry of every expected vector in
            // k_tests; presumably the skip below was meant to ignore it - confirm.
            //if (t == 1) continue;
            if (t != i) {
                fprintf(stderr, "%s : failed test: '%s'\n", __func__, str);
                fprintf(stderr, "%s : expected tokens: %d\n", __func__, i);
                fprintf(stderr, "%s : got tokens: ", __func__);
                // Distinct names for the inner loops so they no longer shadow `t`.
                for (const auto & got : res) {
                    fprintf(stderr, "%6d, ", got);
                }
                for (const auto & got : res) {
                    fprintf(stderr, "%s|", llama_token_to_str(ctx, got));
                }
                fprintf(stderr, "\n");
            }
        }
    }
#endif
llama_free_model(model); llama_free_model(model);
llama_free(ctx); llama_free(ctx);