cmpnct_gpt2bpe.hpp : comments

This commit is contained in:
klosax 2023-08-04 04:57:51 +02:00 committed by GitHub
parent 278ada9572
commit db5618ad99
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -11,12 +11,15 @@
#include <queue>
#include <cstring>
//-----
// Unicode GPT2 Byte Pair Encoding Tokenizer
// Adapted from https://github.com/cmp-nct/ggllm.cpp
//-----
/**
* https://github.com/cmp-nct/ggllm.cpp
* Minimal library for high performance handling and categorization of UTF8 strings and characters
* Using std::string
*/
// Unicode library (from cmpnct_unicode.cpp)
// Minimal library for high performance handling and categorization of UTF8 strings and characters
// Using std::string
enum CNCTCharType {
DIGIT, // a numerical char in any language
@@ -367,7 +370,7 @@ bool CNCTUnicode::string_test(const std::string &str, CNCTCharType chartype)
return true;
}
// Ported from libfalcon.cpp (https://github.com/cmp-nct/ggllm.cpp)
// llama.cpp GPT2 vocab (from libfalcon.cpp)
std::string replaceAll(std::string str, const std::string& from, const std::string& to) {
size_t start_pos = 0;