cmpnct_gpt2bpe.hpp : comments

This commit is contained in:
klosax 2023-08-04 04:57:51 +02:00 committed by GitHub
parent 278ada9572
commit db5618ad99
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -11,12 +11,15 @@
#include <queue> #include <queue>
#include <cstring> #include <cstring>
//-----
// Unicode GPT2 Byte Pair Encoding Tokenizer
// Adapted from https://github.com/cmp-nct/ggllm.cpp
//-----
/** // Unicode library (from cmpnct_unicode.cpp)
* https://github.com/cmp-nct/ggllm.cpp
* Minimal library for high performance handling and categorization of UTF8 strings and characters // Minimal library for high performance handling and categorization of UTF8 strings and characters
* Using std::string // Using std::string
*/
enum CNCTCharType { enum CNCTCharType {
DIGIT, // a numerical char in any language DIGIT, // a numerical char in any language
@@ -367,7 +370,7 @@ bool CNCTUnicode::string_test(const std::string &str, CNCTCharType chartype)
return true; return true;
} }
// Ported from libfalcon.cpp (https://github.com/cmp-nct/ggllm.cpp) // llama.cpp GPT2 vocab (from libfalcon.cpp)
std::string replaceAll(std::string str, const std::string& from, const std::string& to) { std::string replaceAll(std::string str, const std::string& from, const std::string& to) {
size_t start_pos = 0; size_t start_pos = 0;