Add tests

2025-01-12 03:31:46 +00:00 · 2023-07-02 22:06:10 +08:00 · 2023-07-02 22:06:10 +08:00 · 6caa06638f
commit 6caa06638f
parent e818537027
1 changed file with 33 additions and 0 deletions
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@ -14,6 +14,7 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests()
        { " this is 🦙.cpp",    { 1,    445,    338,  29871,    243,    162,    169,    156,  29889,   8223, }, },
        { "w048 7tuijk dsdfhu", { 1,  29893,  29900,  29946,  29947,  29871,  29955,   9161,  13535,  18031,   2176,   6905, }, },
        { "нещо на Български",  { 1,    821,   4851,    665,   1386,  29713,   1305, }, },
        { ">>>>ANSWER<<",       { 1,   6778,   6778,   2190,  23066,   1001,   9314,}, },
    };
    return _k_tests;
 };
@ -94,6 +95,38 @@ int main(int argc, char **argv) {
        }
    }
 #if 0
    // Disabled (compiled out by `#if 0`) round-trip diagnostic.
    //
    // For every token id from 1 up to (but excluding) llama_n_vocab(ctx), the
    // token is converted to its string form and that string is tokenized again.
    // Every resulting token that differs from the original id is reported on
    // stderr. Token id 0 is not checked because the loop starts at 1.
    //
    // Despite the "how many" wording of the original note, nothing is counted
    // here: mismatches are only printed.
    //
    // how many tokens would not tokenize to themselves
    for (llama_token i = 1; i < llama_n_vocab(ctx); i++)
    {
        const char* str = llama_token_to_str(ctx, i);
        // Fixed-size output buffer of 100 tokens, shrunk below to the count
        // reported by llama_tokenize.
        std::vector<llama_token> res(100);
        // NOTE(review): the trailing `false` is presumably the "add BOS" flag -
        // confirm against the llama_tokenize declaration.
        const int n = llama_tokenize(ctx, str, res.data(), int(res.size()), false);
        // NOTE(review): if llama_tokenize can return a negative value (e.g. to
        // signal that the buffer is too small - confirm in llama.h), this
        // resize would receive a negative count converted to an unsigned size.
        res.resize(n);
        for (const auto & t : res)
        {
            //if (t == 1) continue;
            if (t != i) {
                // The report below is emitted once per mismatching token, so a
                // result with several differing tokens prints the same dump
                // several times.
                fprintf(stderr, "%s : failed test: '%s'\n", __func__, str);
                fprintf(stderr, "%s : expected tokens: %d\n", __func__, i);
                fprintf(stderr, "%s : got tokens:      ", __func__);
                // Both inner loops declare their own `t`, shadowing the outer
                // loop variable; they dump the whole result as ids, then as
                // '|'-separated token strings.
                for (const auto & t : res) {
                    fprintf(stderr, "%6d, ", t);
                }
                for (const auto & t : res) {
                    fprintf(stderr, "%s|", llama_token_to_str(ctx, t));
                }
                fprintf(stderr, "\n");
            }
        }
    }
 #endif
    llama_free_model(model);
    llama_free(ctx);