From db5618ad9919ca4e4fb29b6fa6509923368f4755 Mon Sep 17 00:00:00 2001 From: klosax <131523366+klosax@users.noreply.github.com> Date: Fri, 4 Aug 2023 04:57:51 +0200 Subject: [PATCH] cmpnct_gpt2bpe.hpp : comments --- cmpnct_gpt2bpe.hpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/cmpnct_gpt2bpe.hpp b/cmpnct_gpt2bpe.hpp index 0743a294a..ac5f8c672 100644 --- a/cmpnct_gpt2bpe.hpp +++ b/cmpnct_gpt2bpe.hpp @@ -11,12 +11,15 @@ #include #include +//----- +// Unicode GPT2 Byte Pair Encoding Tokenizer +// Adapted from https://github.com/cmp-nct/ggllm.cpp +//----- -/** - * https://github.com/cmp-nct/ggllm.cpp - * Minimal library for high performance handling and categorization of UTF8 strings and characters - * Using std::string - */ +// Unicode library (from cmpnct_unicode.cpp) + +// Minimal library for high performance handling and categorization of UTF8 strings and characters +// Using std::string enum CNCTCharType { DIGIT, // a numerical char in any language @@ -367,7 +370,7 @@ bool CNCTUnicode::string_test(const std::string &str, CNCTCharType chartype) return true; } -// Ported from libfalcon.cpp (https://github.com/cmp-nct/ggllm.cpp) +// llama.cpp GPT2 vocab (from libfalcon.cpp) std::string replaceAll(std::string str, const std::string& from, const std::string& to) { size_t start_pos = 0;