llama.cpp/tests/test-tokenizer-0.cpp
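
// Tokenizer regression test: loads a vocab-only llama context from the model
// file given on the command line and checks that tokenizing each reference
// string yields the expected token IDs.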

#include "utils.h"
#include "llama.h"
#include <cstdio>
#include <string>
#include <map>
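
// expected token IDs for each test string; tokenization is done with
// add_bos = true, so every sequence starts with the BOS token (id 1)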
static const std::map<std::string, std::vector<llama_token>> k_tests = {
    { "Hello World",        { 1, 10994, 2787, }, },
    { " Hello World",       { 1, 15043, 2787, }, },
    { " Hello World!",      { 1, 15043, 2787, 29991, }, },
    { " this is 🦙.cpp",     { 1, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
    { "w048 7tuijk dsdfhu", { 1, 29893, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
    { "нещо на Български",  { 1, 821, 4851, 665, 1386, 29713, 1305, }, },
};

int main(int argc, char **argv) {
    if (argc < 2) {
        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
        return 1;
    }

    const std::string fname = argv[1];

    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());

    llama_context * ctx;

    // load the vocab
    {
        auto lparams = llama_context_default_params();

        lparams.vocab_only = true;

        ctx = llama_init_from_file(fname.c_str(), lparams);

        if (ctx == NULL) {
            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
            return 1;
        }
    }

    const int n_vocab = llama_n_vocab(ctx);

    if (n_vocab != 32000) {
        fprintf(stderr, "%s : expected 32000 tokens, got %d\n", __func__, n_vocab);
        return 2;
    }

for (const auto & test_kv : k_tests) {
const auto res = ::llama_tokenize(ctx, test_kv.first, true);
bool correct = res.size() == test_kv.second.size();
for (int i = 0; i < (int) res.size() && correct; ++i) {
if (res[i] != test_kv.second[i]) {
correct = false;
}
}
if (!correct) {
fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
fprintf(stderr, "%s : expected tokens: ", __func__);
for (const auto & t : test_kv.second) {
fprintf(stderr, "%6d, ", t);
}
fprintf(stderr, "\n");
fprintf(stderr, "%s : got tokens: ", __func__);
for (const auto & t : res) {
fprintf(stderr, "%6d, ", t);
}
fprintf(stderr, "\n");
return 3;
}
}

    llama_free(ctx);

    return 0;
}