2024-05-04 05:32:32 +00:00
|
|
|
#!/bin/bash
|
|
|
|
#
|
|
|
|
# Usage:
|
|
|
|
#
|
|
|
|
# test-tokenizer-0.sh <name> <input>
|
|
|
|
#
|
|
|
|
|
|
|
|
# Require exactly two positional arguments: <name> and <input>.
if [ $# -ne 2 ]; then
    # Pass $0 as a printf argument rather than embedding it in the format
    # string, so a '%' or backslash in the script path is printed literally.
    printf "Usage: %s <name> <input>\n" "$0"
    exit 1
fi
|
|
|
|
|
|
|
|
# Positional arguments:
#   name  - selects ./models/tokenizers/<name> and ./models/ggml-vocab-<name>.gguf
#   input - text file handed to both tokenizer runs
name=$1
input=$2

# Build the C++ test binary. This runs before `set -e` is enabled, so a failed
# build does not stop the script and any existing binary is used as-is.
make -j tests/test-tokenizer-0

# Quote the expansions so a value containing whitespace or glob characters is
# passed as a single argument and printf does not re-apply the format.
printf "Testing %s on %s ...\n" "$name" "$input"
|
|
|
|
|
2024-05-21 16:53:48 +00:00
|
|
|
# From here on, abort as soon as any command fails.
set -e

# Run the Python tokenizer test; stdout and stderr both go to a per-name log.
# All expansions are quoted so paths containing whitespace stay intact.
printf "Tokenizing using (py) Python AutoTokenizer ...\n"
python3 ./tests/test-tokenizer-0.py "./models/tokenizers/$name" --fname-tok "$input" > "/tmp/test-tokenizer-0-$name-py.log" 2>&1

# Run the C++ tokenizer test on the same input, logged the same way.
printf "Tokenizing using (cpp) llama.cpp ...\n"
./tests/test-tokenizer-0 "./models/ggml-vocab-$name.gguf" "$input" > "/tmp/test-tokenizer-0-$name-cpp.log" 2>&1
|
2024-05-21 16:53:48 +00:00
|
|
|
|
|
|
|
# Print the "tokenized in" line(s) from each log. grep reads the file directly
# (no intermediate cat); with a single file operand the output has no filename
# prefix, so the printed lines are the same as before.
grep "tokenized in" "/tmp/test-tokenizer-0-$name-py.log"
grep "tokenized in" "/tmp/test-tokenizer-0-$name-cpp.log"
|
|
|
|
|
2024-05-28 12:04:09 +00:00
|
|
|
# Stop aborting on failure: diff exits non-zero when the files differ, and
# that case is reported below rather than treated as a fatal error.
set +e

# Compare <input>.tok with <input>.tokcpp (presumably written by the Python
# and C++ runs respectively -- confirm against those programs).
# The command is tested directly instead of inspecting $? afterwards.
if diff "$input.tok" "$input.tokcpp" > /dev/null 2>&1; then
    printf "Tokenization is correct!\n"
else
    # Show at most the first 32 lines of the difference.
    diff "$input.tok" "$input.tokcpp" | head -n 32

    printf "Tokenization differs!\n"
fi
|