tests : add test-tokenizer-0.sh + fix some tokenizers (#7036)

* tests : add test-tokenizer-0.sh

* unicode : add all unicode number ranges

* starcoder : fix pre-tokenizer

* tests : add test that fails with DeepSeek tokenizers

* falcon : fix regex

* unicode : regenerate unicode tables

* refact : add tokenizer model

* lint : fix

* tests : disable failing tests

ggml-ci

* refact : add tests files

ggml-ci

* convert : print -> logging

ggml-ci

* lint : fix

* unicode : digit -> number

* phi-3 : update
This commit is contained in:
Georgi Gerganov 2024-05-04 08:32:32 +03:00 committed by GitHub
parent a2ac89d6ef
commit 92139b90af
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
41 changed files with 903 additions and 719 deletions

View File

@ -1,4 +1,4 @@
[flake8]
max-line-length = 125
ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503
exclude = examples/*,examples/*/**,*/**/__init__.py
exclude = examples/*,examples/*/**,*/**/__init__.py,scripts/gen-unicode-data.py,tests/test-tokenizer-0.py

View File

@ -77,11 +77,10 @@ test: $(TEST_TARGETS)
./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-llm.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-refact.gguf; \
elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
continue; \
elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \

View File

@ -31,6 +31,7 @@ from hashlib import sha256
from enum import IntEnum, auto
from transformers import AutoTokenizer
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("convert-hf-to-gguf-update")
@ -62,6 +63,7 @@ models = [
{"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
{"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
{"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
]
# make directory "models/tokenizers" if it doesn't exist
@ -158,8 +160,8 @@ src_func = f"""
chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()
print(f"chktok: {{chktok}}")
print(f"chkhsh: {{chkhsh}}")
logger.debug(f"chktok: {{chktok}}")
logger.debug(f"chkhsh: {{chkhsh}}")
res = None
@ -168,22 +170,22 @@ src_func = f"""
# don't edit the hashes manually!
{src_ifs}
if res is None:
print("\\n")
print("**************************************************************************************")
print("** WARNING: The BPE pre-tokenizer was not recognized!")
print("** There are 2 possible reasons for this:")
print("** - the model has not been added to convert-hf-to-gguf-update.py yet")
print("** - the pre-tokenization config has changed upstream")
print("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
print("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
print("**")
print(f"** chkhsh: {{chkhsh}}")
print("**************************************************************************************")
print("\\n")
logger.warning("\\n")
logger.warning("**************************************************************************************")
logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
logger.warning("** There are 2 possible reasons for this:")
logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet")
logger.warning("** - the pre-tokenization config has changed upstream")
logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
logger.warning("**")
logger.warning(f"** chkhsh: {{chkhsh}}")
logger.warning("**************************************************************************************")
logger.warning("\\n")
raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
print(f"tokenizer.ggml.pre: {{repr(res)}}")
print(f"chkhsh: {{chkhsh}}")
logger.debug(f"tokenizer.ggml.pre: {{repr(res)}}")
logger.debug(f"chkhsh: {{chkhsh}}")
return res
"""
@ -197,6 +199,8 @@ logger.info("\n")
# generate tests for each tokenizer model
tests = [
"ied 4 ½ months",
"Führer",
"",
" ",
" ",
@ -281,6 +285,6 @@ logger.info("\nRun the following commands to generate the vocab files for testin
for model in models:
name = model["name"]
logger.info(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")
print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100
logger.info("\n")

View File

@ -308,6 +308,9 @@ class Model(ABC):
if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
# ref: https://huggingface.co/openai-community/gpt2
res = "gpt-2"
if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
# ref: https://huggingface.co/smallcloudai/Refact-1_6-base
res = "refact"
if res is None:
logger.warning("\n")
@ -324,7 +327,7 @@ class Model(ABC):
logger.warning("\n")
raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
logger.debug(f"tokenizer.ggml.pre: {res}")
logger.debug(f"tokenizer.ggml.pre: {repr(res)}")
logger.debug(f"chkhsh: {chkhsh}")
return res

View File

@ -4383,6 +4383,9 @@ static void llm_load_vocab(
} else if (
tokenizer_pre == "gpt-2") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
} else if (
tokenizer_pre == "refact") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
} else {
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
}
@ -11952,7 +11955,7 @@ static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id
static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
GGML_ASSERT(llama_is_byte_token(vocab, id));
const auto& token_data = vocab.id_to_token.at(id);
const auto & token_data = vocab.id_to_token.at(id);
switch (llama_vocab_get_type(vocab)) {
case LLAMA_VOCAB_TYPE_SPM: {
auto buf = token_data.text.substr(3, 2);
@ -12212,14 +12215,13 @@ struct llm_tokenizer_bpe {
"\\s?\\p{L}+",
"\\s?\\p{P}+",
"[一-龥ࠀ-一가-퟿]+",
"\\p{N}+",
"\\p{N}",
});
break;
case LLAMA_VOCAB_PRE_TYPE_FALCON:
word_collection = unicode_regex_split(text, {
"[\\p{P}\\$\\+<=>\\^~\\|]+",
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
"\\p{N}+",
"[0-9][0-9][0-9]",
});
break;
@ -12235,6 +12237,12 @@ struct llm_tokenizer_bpe {
});
break;
case LLAMA_VOCAB_PRE_TYPE_STARCODER:
case LLAMA_VOCAB_PRE_TYPE_REFACT:
word_collection = unicode_regex_split(text, {
"\\p{N}",
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
});
break;
case LLAMA_VOCAB_PRE_TYPE_GPT2:
word_collection = unicode_regex_split(text, {
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
@ -17466,9 +17474,10 @@ int32_t llama_tokenize(
static std::string llama_decode_text(const std::string & text) {
std::string decoded_text;
auto unicode_sequences = unicode_cpts_from_utf8(text);
for (auto & unicode_sequence : unicode_sequences) {
decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(unicode_sequence));
const auto cpts = unicode_cpts_from_utf8(text);
for (const auto cpt : cpts) {
decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt));
}
return decoded_text;

View File

@ -79,6 +79,7 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
};
// note: these values should be synchronized with ggml_rope

View File

@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__

View File

@ -1,3 +1,5 @@
29464 2094 1018 1092 2706
11865 17875

View File

@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__

View File

@ -1,3 +1,5 @@
1050 207 19 207 19192 4217
37 32009 71 6247
207
243

View File

@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__

View File

@ -1,3 +1,5 @@
1052 207 19 207 19109 4223
37 100014 71 6245
207
243

View File

@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__

View File

@ -1,3 +1,5 @@
878 204 31 3068 133 2137
28611 132 30042
204
258

View File

@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__

View File

@ -1,3 +1,5 @@
798 604 25208 1933
37 9116 71 11751
220
220 220

View File

@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__

View File

@ -1,3 +1,5 @@
1142 220 19 220 27154 4038
37 51853 261
220
256

View File

@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__

View File

@ -1,3 +1,5 @@
474 287 29871 29946 29871 30226 7378
383 4000 261
259
1678

View File

@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__

View File

@ -1,3 +1,5 @@
728 577 24142 2607
39 26288 6554
209
50276

Binary file not shown.

View File

@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__

View File

@ -1,3 +1,5 @@
474 287 29871 29946 29871 30226 7378
383 4000 261
259
1678

Binary file not shown.

View File

@ -0,0 +1,106 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
this is 🦙.cpp
__ggml_vocab_test__
w048 7tuijk dsdfhu
__ggml_vocab_test__
нещо на Български
__ggml_vocab_test__
កាន់តែពិសេសអាចខលចេញ
__ggml_vocab_test__
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
Hello
__ggml_vocab_test__
(
__ggml_vocab_test__
=
__ggml_vocab_test__
' era
__ggml_vocab_test__
Hello, y'all! How are you 😁 ?我想在apple工作1314151天
__ggml_vocab_test__
3
__ggml_vocab_test__
33
__ggml_vocab_test__
333
__ggml_vocab_test__
3333
__ggml_vocab_test__
33333
__ggml_vocab_test__
333333
__ggml_vocab_test__
3333333
__ggml_vocab_test__
33333333
__ggml_vocab_test__
333333333
__ggml_vocab_test__
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
__ggml_vocab_test__

View File

@ -0,0 +1,43 @@
4833 225 38 225 143 140 17723
56 2006 3935 265
225
261
264
202
203
478
2831
15773
8279 5788
12000 5788
8279 10896
12000 10896
12000 10896 19
8279 30 5788 19
12000 30 5788 19
458 438 5945 118 252 32 3766
105 34 38 42 225 41 102 1707 12530 10180 1479 8278
39862 8372 1039 9446 40242 13852 2053 8949 12531 1520 10700
14574 227 14574 133 14574 246 30457 238 14574 242 30457 229 14574 249 14574 134 14574 258 30457 228 14574 258 14574 114 14574 133 14574 232 14574 228 14574 254 14574 232 30457 228 14574 236
3807 253 227 308 4382 27 18458 133 46113 44967 123 13868 308 12565 19775 33071 40824 733 27 41889 308 2585 22680 688 1401 2819 4369 2404 27
8279
12000
225 12000
261 12000
264 12000
264 12000 284 12000
308
203 280
25 34666
8279 30 533 25 464 19 4971 884 844 18458 228 1018 4982 13368 2909 9513 17827 35 37 35 38 35 39 35 11873 47838
37
37 37
37 37 37
37 37 37 37
37 37 37 37 37
37 37 37 37 37 37
37 37 37 37 37 37 37
37 37 37 37 37 37 37 37
37 37 37 37 37 37 37 37 37
334 719 8878 202 10885 4222 16104 28570 203 3807 253 227 308 4382 27 18458 133 46113 44967 123 13868 308 12565 19775 33071 40824 733 27 41889 5945 118 252 3807 118 252 225 37 225 37 37 225 37 37 37 225 37 37 37 37 225 37 37 37 37 37 225 37 37 37 37 37 37 225 37 37 37 37 37 37 37 225 37 37 37 37 37 37 37 37 225 37 32 37 225 37 497 37 225 37 1179 37 225 14574 227 14574 133 14574 246 30457 238 14574 242 30457 229 14574 249 14574 134 14574 258 30457 228 14574 258 14574 114 14574 133 14574 232 36628 228 1018 4982 13368 2909 9513 17827 35 37 35 38 35 39 35 11873 47838 20921 16623 13028 8372 1039 9446 40242 13852 2053 8949 12531 1520 10700 5881 9592 13299 914 31753 31359 9163 3202 35472 10397 439 4763 2583 330 102 1455 938 1182 2017 30 330 613 844 3654 49 330 63 646 3654 439 4621 1930 561 30 330 54 844 2124 1629 35993 49 2688 25 7709 312 25 94 62

View File

@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__

View File

@ -1,3 +1,5 @@
4850 244 57 244 162 159 17722
75 2022 3943 284
244
280

View File

@ -0,0 +1,66 @@
import regex
def cpt_to_utf8_str(cpt):
if cpt <= 0xFF:
return bytes([cpt, 0, 0, 0])
elif cpt <= 0xFFFF:
return bytes([cpt & 0xFF, cpt >> 8, 0, 0])
elif cpt <= 0xFFFFFF:
return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, 0])
else:
return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, cpt >> 24])
def is_match(codepoint, regex_expr):
try:
res = regex.match(regex_expr, cpt_to_utf8_str(codepoint).decode('utf-32'))
return res is not None
except Exception:
return False
def get_matches(regex_expr):
unicode_ranges = []
current_range = None
for codepoint in range(0x110000):
if is_match(codepoint, regex_expr):
if current_range is None:
current_range = [codepoint, codepoint]
else:
current_range[1] = codepoint
elif current_range is not None:
unicode_ranges.append(tuple(current_range))
current_range = None
if current_range is not None:
unicode_ranges.append(tuple(current_range))
return unicode_ranges
def print_cat(cat, ranges):
print("const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_{} = {{".format(cat))
cnt = 0
for start, end in ranges:
if cnt % 4 != 0:
print(" ", end="")
print("{{0x{:08X}, 0x{:08X}}},".format(start, end), end="")
if cnt % 4 == 3:
print("")
cnt += 1
if cnt % 4 != 0:
print("")
print("};")
print("")
print_cat("number", get_matches(r'\p{N}'))
print_cat("letter", get_matches(r'\p{L}'))
print_cat("whitespace", get_matches(r'\p{Z}'))
print_cat("accent_mark", get_matches(r'\p{M}'))
print_cat("punctuation", get_matches(r'\p{P}'))
print_cat("symbol", get_matches(r'\p{S}'))
print_cat("control", get_matches(r'\p{C}'))

View File

@ -74,13 +74,15 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm ARGS ${CMAKE
llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-phi-3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-phi-3.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bert-bge.gguf)
# TODO: enable when fixed
# https://github.com/ggerganov/llama.cpp/pull/7036
#llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
#llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
#llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
# build test-tokenizer-1-bpe target once and add many tests
add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)

View File

@ -1,126 +0,0 @@
# tests with BPE tokenizer
#
# sample usage:
#
# python3 tests/test-tokenizer-0-bpe.py ~/Data/huggingface/Meta-Llama-3-8B-Instruct/
# python3 tests/test-tokenizer-0-bpe.py ~/Data/huggingface/falcon-7b/
# python3 tests/test-tokenizer-0-bpe.py ~/Data/huggingface/deepseek-coder-6.7b-instruct/
#
import logging
import argparse
from transformers import AutoTokenizer
logger = logging.getLogger("test-tokenizer-0-bpe")
parser = argparse.ArgumentParser()
parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
parser.add_argument("--fname-tok", help="path to a text file to tokenize")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
args = parser.parse_args()
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
dir_tokenizer = args.dir_tokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
tests = [
"",
" ",
" ",
" ",
"\t",
"\n",
"\n\n",
"\n\n\n",
"\t\n",
"Hello world",
" Hello world",
"Hello World",
" Hello World",
" Hello World!",
"Hello, world!",
" Hello, world!",
" this is 🦙.cpp",
"w048 7tuijk dsdfhu",
"нещо на Български",
"កាន់តែពិសេសអាចខលចេញ",
"🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
"Hello",
" Hello",
" Hello",
" Hello",
" Hello",
" Hello\n Hello",
" (",
"\n =",
"' era",
"Hello, y'all! How are you 😁 ?我想在apple工作1314151天",
"3",
"33",
"333",
"3333",
"33333",
"333333",
"3333333",
"33333333",
"333333333",
]
for text in tests:
logger.info(f"text: {text}")
logger.info(tokenizer.encode(text))
logger.info(tokenizer.decode(tokenizer.encode(text)))
logger.info("tests for C++:")
for text in tests:
res = tokenizer.encode(text)
# Modify text representation for logging
k = text.replace('\n', '\\n')
k = k.replace('\t', '\\t')
k = '"' + k + '"'
# Log the modified text and its encoding
log_message = "{ %-24s, { " % k
for x in res:
log_message += "%7d," % x
log_message += " }, },"
logger.info(log_message)
logger.info(tokenizer.encode('hello'))
logger.info(tokenizer.encode('world'))
logger.info(tokenizer.encode(' world'))
logger.info(tokenizer.encode('hello world'))
fname_tok = args.fname_tok
if fname_tok:
logger.info(f"tokenizing file: {fname_tok}")
fname_out = fname_tok + '.tok'
with open(fname_tok, 'r', encoding='utf-8') as f:
lines = f.readlines()
s = ''.join(lines)
res = tokenizer.encode(s)
# write to file
with open(fname_out, 'w', encoding='utf-8') as f:
for x in res:
# LLaMA v3 for some reason strips the space for these tokens (and others)
# if x == 662:
# f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
# elif x == 1174:
# f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
# elif x == 2564:
# f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
# elif x == 758:
# f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
# elif x == 949:
# f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
# elif x == 5354:
# f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
# else:
# f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
f.write(str(x) + ' \'' + tokenizer.decode(x).strip() + '\'\n')
logger.info(f"len(res): {len(res)}")
logger.info(f"len(lines): {len(lines)}")
logger.info(f"results written to: {fname_out}")

View File

@ -1,126 +0,0 @@
# tests with SPM tokenizer
#
# sample usage:
#
# python3 tests/test-tokenizer-0-spm.py ~/Data/huggingface/Llama-2-7b-hf/
# python3 tests/test-tokenizer-0-spm.py ~/Data/huggingface/CodeLlama-34b-Instruct-hf/
#
import logging
import argparse
from sentencepiece import SentencePieceProcessor
logger = logging.getLogger("test-tokenizer-0-spm")
parser = argparse.ArgumentParser()
parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
parser.add_argument("--fname-tok", help="path to a text file to tokenize")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
args = parser.parse_args()
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
dir_tokenizer = args.dir_tokenizer
tokenizer = SentencePieceProcessor(dir_tokenizer + '/tokenizer.model')
tests = [
"",
" ",
" ",
" ",
"\t",
"\n",
"\n\n",
"\n\n\n",
"\t\n",
"Hello world",
" Hello world",
"Hello World",
" Hello World",
" Hello World!",
"Hello, world!",
" Hello, world!",
" this is 🦙.cpp",
"w048 7tuijk dsdfhu",
"нещо на Български",
"កាន់តែពិសេសអាចខលចេញ",
"🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
"Hello",
" Hello",
" Hello",
" Hello",
" Hello",
" Hello\n Hello",
" (",
"\n =",
"' era",
"Hello, y'all! How are you 😁 ?我想在apple工作1314151天",
"3",
"33",
"333",
"3333",
"33333",
"333333",
"3333333",
"33333333",
"333333333",
]
for text in tests:
message_log = (f"text: {text}\n"
"with bos:\n"
f"{tokenizer.encode(text, add_bos=True)}\n"
f"{tokenizer.decode(tokenizer.encode(text, add_bos=True))}\n"
"without bos:\n"
f"{tokenizer.encode(text, add_bos=False)}\n"
f"{tokenizer.decode(tokenizer.encode(text, add_bos=False))}\n")
logger.info(message_log)
logger.info(f"'{tokenizer.id_to_piece(15043)}'") # '_Hello'
logger.info(f"'{tokenizer.id_to_piece(29871)}'") # '_'
logger.info(f"'{tokenizer.decode([15043])}'") # 'Hello'
logger.info(f"'{tokenizer.decode([15043, 15043])}'") # 'Hello Hello'
logger.info(f"'{tokenizer.decode([29871, 15043])}'") # ' Hello'
logger.info(f"'{tokenizer.decode([29871, 15043, 29871, 15043])}'") # ' Hello Hello'
logger.info("\n\ntests for C++:\n")
for text in tests:
res = tokenizer.encode(text, add_bos=False)
# Modify text representation for logging
k = text.replace('\n', '\\n')
k = k.replace('\t', '\\t')
k = '"' + k + '"'
# Log the modified text and its encoding
log_message = "{ %-24s, { " % k
for x in res:
log_message += "%7d," % x
log_message += " }, },"
logger.info(log_message)
logger.info(tokenizer.encode('hello'))
logger.info(tokenizer.encode('world'))
logger.info(tokenizer.encode(' world'))
logger.info(tokenizer.encode('hello world'))
fname_tok = args.fname_tok
if fname_tok:
logger.info(f"tokenizing file: {fname_tok}")
fname_out = fname_tok + '.tok'
with open(fname_tok, 'r', encoding='utf-8') as f:
lines = f.readlines()
s = ''.join(lines)
res = tokenizer.encode(s, add_bos=True)
# write to file
with open(fname_out, 'w', encoding='utf-8') as f:
for x in res:
f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
logger.info(f"len(res): {len(res)}")
logger.info(f"len(lines): {len(lines)}")
logger.info(f"results written to: {fname_out}")

View File

@ -55,8 +55,10 @@
// return _k_tests;
//}
static std::map<std::string, std::vector<llama_token>> read_tests(const std::string & fname_inp, const std::string & fname_out) {
std::map<std::string, std::vector<llama_token>> tests;
using llama_tests = std::map<std::string, std::vector<llama_token>>;
static llama_tests read_tests(const std::string & fname_inp, const std::string & fname_out) {
llama_tests tests;
std::ifstream ifs_inp(fname_inp);
if (!ifs_inp) {
@ -175,13 +177,21 @@ int main(int argc, char **argv) {
bool success = true;
const auto k_tests = read_tests(fname_inp, fname_out);
if (k_tests.empty()) {
fprintf(stderr, "%s : error: no tests found\n", __func__);
return 1;
const auto k_tests = [&]() -> llama_tests {
if (!fname_text.empty()) {
return {};
}
const auto res = read_tests(fname_inp, fname_out);
if (res.empty()) {
fprintf(stderr, "%s : error: no tests found\n", __func__);
exit(1);
}
return res;
}();
const bool add_special = false;
for (const auto & test_kv : k_tests) {
@ -238,7 +248,17 @@ int main(int argc, char **argv) {
fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
const std::vector<llama_token> res = llama_tokenize(ctx, text, add_special);
std::vector<llama_token> res;
{
const auto t_start = ggml_time_us();
res = llama_tokenize(ctx, text, add_special);
const auto t_end = ggml_time_us();
fprintf(stderr, "%s : tokenized in %.3f ms (cpp)\n", __func__, (t_end - t_start) / 1000.0);
}
fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
@ -252,7 +272,8 @@ int main(int argc, char **argv) {
}
for (const auto & tok : res) {
ofs << tok << " '" << string_strip(llama_detokenize_bpe(ctx, std::vector<int>{tok})) << "'" << std::endl;
//ofs << tok << " '" << string_strip(llama_detokenize_bpe(ctx, std::vector<int>{tok})) << "'" << std::endl;
ofs << tok << "\n";
}
}

46
tests/test-tokenizer-0.py Normal file
View File

@ -0,0 +1,46 @@
import time
import argparse
from transformers import AutoTokenizer
parser = argparse.ArgumentParser()
parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
parser.add_argument("--fname-tok", help="path to a text file to tokenize", required=True)
args = parser.parse_args()
dir_tokenizer = args.dir_tokenizer
fname_tok = args.fname_tok
tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
print('tokenizing file: ', fname_tok)
fname_out = fname_tok + '.tok'
with open(fname_tok, 'r', encoding='utf-8') as f:
lines = f.readlines()
s = ''.join(lines)
t_start = time.time()
res = tokenizer.encode(s, add_special_tokens=False)
t_end = time.time()
print('\nmain : tokenized in', "{:.3f}".format(1000.0 * (t_end - t_start)), 'ms (py)')
with open(fname_out, 'w', encoding='utf-8') as f:
for x in res:
# LLaMA v3 for some reason strips the space for these tokens (and others)
# if x == 662:
# f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
# elif x == 1174:
# f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
# elif x == 2564:
# f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
# elif x == 758:
# f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
# elif x == 949:
# f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
# elif x == 5354:
# f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
# else:
# f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
# f.write(str(x) + ' \'' + tokenizer.decode(x).strip() + '\'\n')
f.write(str(x) + '\n')
print('len(res): ', len(res))
print('len(lines): ', len(lines))
print('results written to: ', fname_out)

34
tests/test-tokenizer-0.sh Executable file
View File

@ -0,0 +1,34 @@
#!/bin/bash
#
# Usage:
#
# test-tokenizer-0.sh <name> <input>
#
if [ $# -ne 2 ]; then
printf "Usage: $0 <name> <input>\n"
exit 1
fi
name=$1
input=$2
make -j tests/test-tokenizer-0
printf "Testing %s on %s ...\n" $name $input
python3 ./tests/test-tokenizer-0.py ./models/tokenizers/$name --fname-tok $input > /tmp/test-tokenizer-0-$name-py.log 2>&1
cat /tmp/test-tokenizer-0-$name-py.log | grep "tokenized in"
./tests/test-tokenizer-0 ./models/ggml-vocab-$name.gguf $input > /tmp/test-tokenizer-0-$name-cpp.log 2>&1
cat /tmp/test-tokenizer-0-$name-cpp.log | grep "tokenized in"
diff $input.tok $input.tokcpp > /dev/null 2>&1
if [ $? -eq 0 ]; then
printf "Tokenization is correct!\n"
else
diff $input.tok $input.tokcpp | head -n 32
printf "Tokenization differs!\n"
fi

View File

@ -5,27 +5,47 @@
#include <utility>
#include <vector>
const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_digit = {
{0x00000030, 0x00000039}, {0x000000B2, 0x000000B3}, {0x000000B9, 0x000000B9}, {0x00000660, 0x00000669},
{0x000006F0, 0x000006F9}, {0x000007C0, 0x000007C9}, {0x00000966, 0x0000096F}, {0x000009E6, 0x000009EF},
{0x00000A66, 0x00000A6F}, {0x00000AE6, 0x00000AEF}, {0x00000B66, 0x00000B6F}, {0x00000BE6, 0x00000BEF},
{0x00000C66, 0x00000C6F}, {0x00000CE6, 0x00000CEF}, {0x00000D66, 0x00000D6F}, {0x00000DE6, 0x00000DEF},
{0x00000E50, 0x00000E59}, {0x00000ED0, 0x00000ED9}, {0x00000F20, 0x00000F29}, {0x00001040, 0x00001049},
{0x00001090, 0x00001099}, {0x00001369, 0x00001371}, {0x000017E0, 0x000017E9}, {0x00001810, 0x00001819},
{0x00001946, 0x0000194F}, {0x000019D0, 0x000019DA}, {0x00001A80, 0x00001A89}, {0x00001A90, 0x00001A99},
{0x00001B50, 0x00001B59}, {0x00001BB0, 0x00001BB9}, {0x00001C40, 0x00001C49}, {0x00001C50, 0x00001C59},
{0x00002070, 0x00002070}, {0x00002074, 0x00002079}, {0x00002080, 0x00002089}, {0x00002460, 0x00002468},
{0x00002474, 0x0000247C}, {0x00002488, 0x00002490}, {0x000024EA, 0x000024EA}, {0x000024F5, 0x000024FD},
{0x000024FF, 0x000024FF}, {0x00002776, 0x0000277E}, {0x00002780, 0x00002788}, {0x0000278A, 0x00002792},
{0x0000A620, 0x0000A629}, {0x0000A8D0, 0x0000A8D9}, {0x0000A900, 0x0000A909}, {0x0000A9D0, 0x0000A9D9},
{0x0000A9F0, 0x0000A9F9}, {0x0000AA50, 0x0000AA59}, {0x0000ABF0, 0x0000ABF9}, {0x0000FF10, 0x0000FF19},
{0x000104A0, 0x000104A9}, {0x00010A40, 0x00010A43}, {0x00010D30, 0x00010D39}, {0x00010E60, 0x00010E68},
{0x00011052, 0x0001105A}, {0x00011066, 0x0001106F}, {0x000110F0, 0x000110F9}, {0x00011136, 0x0001113F},
{0x000111D0, 0x000111D9}, {0x000112F0, 0x000112F9}, {0x00011450, 0x00011459}, {0x000114D0, 0x000114D9},
{0x00011650, 0x00011659}, {0x000116C0, 0x000116C9}, {0x00011730, 0x00011739}, {0x000118E0, 0x000118E9},
{0x00011950, 0x00011959}, {0x00011C50, 0x00011C59}, {0x00011D50, 0x00011D59}, {0x00011DA0, 0x00011DA9},
{0x00016A60, 0x00016A69}, {0x00016B50, 0x00016B59}, {0x0001D7CE, 0x0001D7FF}, {0x0001E140, 0x0001E149},
{0x0001E2F0, 0x0001E2F9}, {0x0001E950, 0x0001E959}, {0x0001F100, 0x0001F10A}, {0x0001FBF0, 0x0001FBF9},
// generated with scripts/gen-unicode-data.py
//
// TODO: generate unicode_map_lowercase
// TODO: generate unicode_map_nfd
const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_number = {
{0x00000030, 0x00000039}, {0x000000B2, 0x000000B3}, {0x000000B9, 0x000000B9}, {0x000000BC, 0x000000BE},
{0x00000660, 0x00000669}, {0x000006F0, 0x000006F9}, {0x000007C0, 0x000007C9}, {0x00000966, 0x0000096F},
{0x000009E6, 0x000009EF}, {0x000009F4, 0x000009F9}, {0x00000A66, 0x00000A6F}, {0x00000AE6, 0x00000AEF},
{0x00000B66, 0x00000B6F}, {0x00000B72, 0x00000B77}, {0x00000BE6, 0x00000BF2}, {0x00000C66, 0x00000C6F},
{0x00000C78, 0x00000C7E}, {0x00000CE6, 0x00000CEF}, {0x00000D58, 0x00000D5E}, {0x00000D66, 0x00000D78},
{0x00000DE6, 0x00000DEF}, {0x00000E50, 0x00000E59}, {0x00000ED0, 0x00000ED9}, {0x00000F20, 0x00000F33},
{0x00001040, 0x00001049}, {0x00001090, 0x00001099}, {0x00001369, 0x0000137C}, {0x000016EE, 0x000016F0},
{0x000017E0, 0x000017E9}, {0x000017F0, 0x000017F9}, {0x00001810, 0x00001819}, {0x00001946, 0x0000194F},
{0x000019D0, 0x000019DA}, {0x00001A80, 0x00001A89}, {0x00001A90, 0x00001A99}, {0x00001B50, 0x00001B59},
{0x00001BB0, 0x00001BB9}, {0x00001C40, 0x00001C49}, {0x00001C50, 0x00001C59}, {0x00002070, 0x00002070},
{0x00002074, 0x00002079}, {0x00002080, 0x00002089}, {0x00002150, 0x00002182}, {0x00002185, 0x00002189},
{0x00002460, 0x0000249B}, {0x000024EA, 0x000024FF}, {0x00002776, 0x00002793}, {0x00002CFD, 0x00002CFD},
{0x00003007, 0x00003007}, {0x00003021, 0x00003029}, {0x00003038, 0x0000303A}, {0x00003192, 0x00003195},
{0x00003220, 0x00003229}, {0x00003248, 0x0000324F}, {0x00003251, 0x0000325F}, {0x00003280, 0x00003289},
{0x000032B1, 0x000032BF}, {0x0000A620, 0x0000A629}, {0x0000A6E6, 0x0000A6EF}, {0x0000A830, 0x0000A835},
{0x0000A8D0, 0x0000A8D9}, {0x0000A900, 0x0000A909}, {0x0000A9D0, 0x0000A9D9}, {0x0000A9F0, 0x0000A9F9},
{0x0000AA50, 0x0000AA59}, {0x0000ABF0, 0x0000ABF9}, {0x0000FF10, 0x0000FF19}, {0x00010107, 0x00010133},
{0x00010140, 0x00010178}, {0x0001018A, 0x0001018B}, {0x000102E1, 0x000102FB}, {0x00010320, 0x00010323},
{0x00010341, 0x00010341}, {0x0001034A, 0x0001034A}, {0x000103D1, 0x000103D5}, {0x000104A0, 0x000104A9},
{0x00010858, 0x0001085F}, {0x00010879, 0x0001087F}, {0x000108A7, 0x000108AF}, {0x000108FB, 0x000108FF},
{0x00010916, 0x0001091B}, {0x000109BC, 0x000109BD}, {0x000109C0, 0x000109CF}, {0x000109D2, 0x000109FF},
{0x00010A40, 0x00010A48}, {0x00010A7D, 0x00010A7E}, {0x00010A9D, 0x00010A9F}, {0x00010AEB, 0x00010AEF},
{0x00010B58, 0x00010B5F}, {0x00010B78, 0x00010B7F}, {0x00010BA9, 0x00010BAF}, {0x00010CFA, 0x00010CFF},
{0x00010D30, 0x00010D39}, {0x00010E60, 0x00010E7E}, {0x00010F1D, 0x00010F26}, {0x00010F51, 0x00010F54},
{0x00010FC5, 0x00010FCB}, {0x00011052, 0x0001106F}, {0x000110F0, 0x000110F9}, {0x00011136, 0x0001113F},
{0x000111D0, 0x000111D9}, {0x000111E1, 0x000111F4}, {0x000112F0, 0x000112F9}, {0x00011450, 0x00011459},
{0x000114D0, 0x000114D9}, {0x00011650, 0x00011659}, {0x000116C0, 0x000116C9}, {0x00011730, 0x0001173B},
{0x000118E0, 0x000118F2}, {0x00011950, 0x00011959}, {0x00011C50, 0x00011C6C}, {0x00011D50, 0x00011D59},
{0x00011DA0, 0x00011DA9}, {0x00011F50, 0x00011F59}, {0x00011FC0, 0x00011FD4}, {0x00012400, 0x0001246E},
{0x00016A60, 0x00016A69}, {0x00016AC0, 0x00016AC9}, {0x00016B50, 0x00016B59}, {0x00016B5B, 0x00016B61},
{0x00016E80, 0x00016E96}, {0x0001D2C0, 0x0001D2D3}, {0x0001D2E0, 0x0001D2F3}, {0x0001D360, 0x0001D378},
{0x0001D7CE, 0x0001D7FF}, {0x0001E140, 0x0001E149}, {0x0001E2F0, 0x0001E2F9}, {0x0001E4F0, 0x0001E4F9},
{0x0001E8C7, 0x0001E8CF}, {0x0001E950, 0x0001E959}, {0x0001EC71, 0x0001ECAB}, {0x0001ECAD, 0x0001ECAF},
{0x0001ECB1, 0x0001ECB4}, {0x0001ED01, 0x0001ED2D}, {0x0001ED2F, 0x0001ED3D}, {0x0001F100, 0x0001F10C},
{0x0001FBF0, 0x0001FBF9},
};
const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter = {
@ -41,73 +61,73 @@ const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter = {
{0x00000710, 0x00000710}, {0x00000712, 0x0000072F}, {0x0000074D, 0x000007A5}, {0x000007B1, 0x000007B1},
{0x000007CA, 0x000007EA}, {0x000007F4, 0x000007F5}, {0x000007FA, 0x000007FA}, {0x00000800, 0x00000815},
{0x0000081A, 0x0000081A}, {0x00000824, 0x00000824}, {0x00000828, 0x00000828}, {0x00000840, 0x00000858},
{0x00000860, 0x0000086A}, {0x000008A0, 0x000008B4}, {0x000008B6, 0x000008C7}, {0x00000904, 0x00000939},
{0x0000093D, 0x0000093D}, {0x00000950, 0x00000950}, {0x00000958, 0x00000961}, {0x00000971, 0x00000980},
{0x00000985, 0x0000098C}, {0x0000098F, 0x00000990}, {0x00000993, 0x000009A8}, {0x000009AA, 0x000009B0},
{0x000009B2, 0x000009B2}, {0x000009B6, 0x000009B9}, {0x000009BD, 0x000009BD}, {0x000009CE, 0x000009CE},
{0x000009DC, 0x000009DD}, {0x000009DF, 0x000009E1}, {0x000009F0, 0x000009F1}, {0x000009FC, 0x000009FC},
{0x00000A05, 0x00000A0A}, {0x00000A0F, 0x00000A10}, {0x00000A13, 0x00000A28}, {0x00000A2A, 0x00000A30},
{0x00000A32, 0x00000A33}, {0x00000A35, 0x00000A36}, {0x00000A38, 0x00000A39}, {0x00000A59, 0x00000A5C},
{0x00000A5E, 0x00000A5E}, {0x00000A72, 0x00000A74}, {0x00000A85, 0x00000A8D}, {0x00000A8F, 0x00000A91},
{0x00000A93, 0x00000AA8}, {0x00000AAA, 0x00000AB0}, {0x00000AB2, 0x00000AB3}, {0x00000AB5, 0x00000AB9},
{0x00000ABD, 0x00000ABD}, {0x00000AD0, 0x00000AD0}, {0x00000AE0, 0x00000AE1}, {0x00000AF9, 0x00000AF9},
{0x00000B05, 0x00000B0C}, {0x00000B0F, 0x00000B10}, {0x00000B13, 0x00000B28}, {0x00000B2A, 0x00000B30},
{0x00000B32, 0x00000B33}, {0x00000B35, 0x00000B39}, {0x00000B3D, 0x00000B3D}, {0x00000B5C, 0x00000B5D},
{0x00000B5F, 0x00000B61}, {0x00000B71, 0x00000B71}, {0x00000B83, 0x00000B83}, {0x00000B85, 0x00000B8A},
{0x00000B8E, 0x00000B90}, {0x00000B92, 0x00000B95}, {0x00000B99, 0x00000B9A}, {0x00000B9C, 0x00000B9C},
{0x00000B9E, 0x00000B9F}, {0x00000BA3, 0x00000BA4}, {0x00000BA8, 0x00000BAA}, {0x00000BAE, 0x00000BB9},
{0x00000BD0, 0x00000BD0}, {0x00000C05, 0x00000C0C}, {0x00000C0E, 0x00000C10}, {0x00000C12, 0x00000C28},
{0x00000C2A, 0x00000C39}, {0x00000C3D, 0x00000C3D}, {0x00000C58, 0x00000C5A}, {0x00000C60, 0x00000C61},
{0x00000C80, 0x00000C80}, {0x00000C85, 0x00000C8C}, {0x00000C8E, 0x00000C90}, {0x00000C92, 0x00000CA8},
{0x00000CAA, 0x00000CB3}, {0x00000CB5, 0x00000CB9}, {0x00000CBD, 0x00000CBD}, {0x00000CDE, 0x00000CDE},
{0x00000CE0, 0x00000CE1}, {0x00000CF1, 0x00000CF2}, {0x00000D04, 0x00000D0C}, {0x00000D0E, 0x00000D10},
{0x00000D12, 0x00000D3A}, {0x00000D3D, 0x00000D3D}, {0x00000D4E, 0x00000D4E}, {0x00000D54, 0x00000D56},
{0x00000D5F, 0x00000D61}, {0x00000D7A, 0x00000D7F}, {0x00000D85, 0x00000D96}, {0x00000D9A, 0x00000DB1},
{0x00000DB3, 0x00000DBB}, {0x00000DBD, 0x00000DBD}, {0x00000DC0, 0x00000DC6}, {0x00000E01, 0x00000E30},
{0x00000E32, 0x00000E33}, {0x00000E40, 0x00000E46}, {0x00000E81, 0x00000E82}, {0x00000E84, 0x00000E84},
{0x00000E86, 0x00000E8A}, {0x00000E8C, 0x00000EA3}, {0x00000EA5, 0x00000EA5}, {0x00000EA7, 0x00000EB0},
{0x00000EB2, 0x00000EB3}, {0x00000EBD, 0x00000EBD}, {0x00000EC0, 0x00000EC4}, {0x00000EC6, 0x00000EC6},
{0x00000EDC, 0x00000EDF}, {0x00000F00, 0x00000F00}, {0x00000F40, 0x00000F47}, {0x00000F49, 0x00000F6C},
{0x00000F88, 0x00000F8C}, {0x00001000, 0x0000102A}, {0x0000103F, 0x0000103F}, {0x00001050, 0x00001055},
{0x0000105A, 0x0000105D}, {0x00001061, 0x00001061}, {0x00001065, 0x00001066}, {0x0000106E, 0x00001070},
{0x00001075, 0x00001081}, {0x0000108E, 0x0000108E}, {0x000010A0, 0x000010C5}, {0x000010C7, 0x000010C7},
{0x000010CD, 0x000010CD}, {0x000010D0, 0x000010FA}, {0x000010FC, 0x00001248}, {0x0000124A, 0x0000124D},
{0x00001250, 0x00001256}, {0x00001258, 0x00001258}, {0x0000125A, 0x0000125D}, {0x00001260, 0x00001288},
{0x0000128A, 0x0000128D}, {0x00001290, 0x000012B0}, {0x000012B2, 0x000012B5}, {0x000012B8, 0x000012BE},
{0x000012C0, 0x000012C0}, {0x000012C2, 0x000012C5}, {0x000012C8, 0x000012D6}, {0x000012D8, 0x00001310},
{0x00001312, 0x00001315}, {0x00001318, 0x0000135A}, {0x00001380, 0x0000138F}, {0x000013A0, 0x000013F5},
{0x000013F8, 0x000013FD}, {0x00001401, 0x0000166C}, {0x0000166F, 0x0000167F}, {0x00001681, 0x0000169A},
{0x000016A0, 0x000016EA}, {0x000016F1, 0x000016F8}, {0x00001700, 0x0000170C}, {0x0000170E, 0x00001711},
{0x00001720, 0x00001731}, {0x00001740, 0x00001751}, {0x00001760, 0x0000176C}, {0x0000176E, 0x00001770},
{0x00001780, 0x000017B3}, {0x000017D7, 0x000017D7}, {0x000017DC, 0x000017DC}, {0x00001820, 0x00001878},
{0x00001880, 0x00001884}, {0x00001887, 0x000018A8}, {0x000018AA, 0x000018AA}, {0x000018B0, 0x000018F5},
{0x00001900, 0x0000191E}, {0x00001950, 0x0000196D}, {0x00001970, 0x00001974}, {0x00001980, 0x000019AB},
{0x000019B0, 0x000019C9}, {0x00001A00, 0x00001A16}, {0x00001A20, 0x00001A54}, {0x00001AA7, 0x00001AA7},
{0x00001B05, 0x00001B33}, {0x00001B45, 0x00001B4B}, {0x00001B83, 0x00001BA0}, {0x00001BAE, 0x00001BAF},
{0x00001BBA, 0x00001BE5}, {0x00001C00, 0x00001C23}, {0x00001C4D, 0x00001C4F}, {0x00001C5A, 0x00001C7D},
{0x00001C80, 0x00001C88}, {0x00001C90, 0x00001CBA}, {0x00001CBD, 0x00001CBF}, {0x00001CE9, 0x00001CEC},
{0x00001CEE, 0x00001CF3}, {0x00001CF5, 0x00001CF6}, {0x00001CFA, 0x00001CFA}, {0x00001D00, 0x00001DBF},
{0x00001E00, 0x00001F15}, {0x00001F18, 0x00001F1D}, {0x00001F20, 0x00001F45}, {0x00001F48, 0x00001F4D},
{0x00001F50, 0x00001F57}, {0x00001F59, 0x00001F59}, {0x00001F5B, 0x00001F5B}, {0x00001F5D, 0x00001F5D},
{0x00001F5F, 0x00001F7D}, {0x00001F80, 0x00001FB4}, {0x00001FB6, 0x00001FBC}, {0x00001FBE, 0x00001FBE},
{0x00001FC2, 0x00001FC4}, {0x00001FC6, 0x00001FCC}, {0x00001FD0, 0x00001FD3}, {0x00001FD6, 0x00001FDB},
{0x00001FE0, 0x00001FEC}, {0x00001FF2, 0x00001FF4}, {0x00001FF6, 0x00001FFC}, {0x00002071, 0x00002071},
{0x0000207F, 0x0000207F}, {0x00002090, 0x0000209C}, {0x00002102, 0x00002102}, {0x00002107, 0x00002107},
{0x0000210A, 0x00002113}, {0x00002115, 0x00002115}, {0x00002119, 0x0000211D}, {0x00002124, 0x00002124},
{0x00002126, 0x00002126}, {0x00002128, 0x00002128}, {0x0000212A, 0x0000212D}, {0x0000212F, 0x00002139},
{0x0000213C, 0x0000213F}, {0x00002145, 0x00002149}, {0x0000214E, 0x0000214E}, {0x00002183, 0x00002184},
{0x00002C00, 0x00002C2E}, {0x00002C30, 0x00002C5E}, {0x00002C60, 0x00002CE4}, {0x00002CEB, 0x00002CEE},
{0x00002CF2, 0x00002CF3}, {0x00002D00, 0x00002D25}, {0x00002D27, 0x00002D27}, {0x00002D2D, 0x00002D2D},
{0x00002D30, 0x00002D67}, {0x00002D6F, 0x00002D6F}, {0x00002D80, 0x00002D96}, {0x00002DA0, 0x00002DA6},
{0x00002DA8, 0x00002DAE}, {0x00002DB0, 0x00002DB6}, {0x00002DB8, 0x00002DBE}, {0x00002DC0, 0x00002DC6},
{0x00002DC8, 0x00002DCE}, {0x00002DD0, 0x00002DD6}, {0x00002DD8, 0x00002DDE}, {0x00002E2F, 0x00002E2F},
{0x00003005, 0x00003006}, {0x00003031, 0x00003035}, {0x0000303B, 0x0000303C}, {0x00003041, 0x00003096},
{0x0000309D, 0x0000309F}, {0x000030A1, 0x000030FA}, {0x000030FC, 0x000030FF}, {0x00003105, 0x0000312F},
{0x00003131, 0x0000318E}, {0x000031A0, 0x000031BF}, {0x000031F0, 0x000031FF}, {0x00003400, 0x00004DBF},
{0x00004E00, 0x00009FFC}, {0x0000A000, 0x0000A48C}, {0x0000A4D0, 0x0000A4FD}, {0x0000A500, 0x0000A60C},
{0x0000A610, 0x0000A61F}, {0x0000A62A, 0x0000A62B}, {0x0000A640, 0x0000A66E}, {0x0000A67F, 0x0000A69D},
{0x0000A6A0, 0x0000A6E5}, {0x0000A717, 0x0000A71F}, {0x0000A722, 0x0000A788}, {0x0000A78B, 0x0000A7BF},
{0x0000A7C2, 0x0000A7CA}, {0x0000A7F5, 0x0000A801}, {0x0000A803, 0x0000A805}, {0x0000A807, 0x0000A80A},
{0x00000860, 0x0000086A}, {0x00000870, 0x00000887}, {0x00000889, 0x0000088E}, {0x000008A0, 0x000008C9},
{0x00000904, 0x00000939}, {0x0000093D, 0x0000093D}, {0x00000950, 0x00000950}, {0x00000958, 0x00000961},
{0x00000971, 0x00000980}, {0x00000985, 0x0000098C}, {0x0000098F, 0x00000990}, {0x00000993, 0x000009A8},
{0x000009AA, 0x000009B0}, {0x000009B2, 0x000009B2}, {0x000009B6, 0x000009B9}, {0x000009BD, 0x000009BD},
{0x000009CE, 0x000009CE}, {0x000009DC, 0x000009DD}, {0x000009DF, 0x000009E1}, {0x000009F0, 0x000009F1},
{0x000009FC, 0x000009FC}, {0x00000A05, 0x00000A0A}, {0x00000A0F, 0x00000A10}, {0x00000A13, 0x00000A28},
{0x00000A2A, 0x00000A30}, {0x00000A32, 0x00000A33}, {0x00000A35, 0x00000A36}, {0x00000A38, 0x00000A39},
{0x00000A59, 0x00000A5C}, {0x00000A5E, 0x00000A5E}, {0x00000A72, 0x00000A74}, {0x00000A85, 0x00000A8D},
{0x00000A8F, 0x00000A91}, {0x00000A93, 0x00000AA8}, {0x00000AAA, 0x00000AB0}, {0x00000AB2, 0x00000AB3},
{0x00000AB5, 0x00000AB9}, {0x00000ABD, 0x00000ABD}, {0x00000AD0, 0x00000AD0}, {0x00000AE0, 0x00000AE1},
{0x00000AF9, 0x00000AF9}, {0x00000B05, 0x00000B0C}, {0x00000B0F, 0x00000B10}, {0x00000B13, 0x00000B28},
{0x00000B2A, 0x00000B30}, {0x00000B32, 0x00000B33}, {0x00000B35, 0x00000B39}, {0x00000B3D, 0x00000B3D},
{0x00000B5C, 0x00000B5D}, {0x00000B5F, 0x00000B61}, {0x00000B71, 0x00000B71}, {0x00000B83, 0x00000B83},
{0x00000B85, 0x00000B8A}, {0x00000B8E, 0x00000B90}, {0x00000B92, 0x00000B95}, {0x00000B99, 0x00000B9A},
{0x00000B9C, 0x00000B9C}, {0x00000B9E, 0x00000B9F}, {0x00000BA3, 0x00000BA4}, {0x00000BA8, 0x00000BAA},
{0x00000BAE, 0x00000BB9}, {0x00000BD0, 0x00000BD0}, {0x00000C05, 0x00000C0C}, {0x00000C0E, 0x00000C10},
{0x00000C12, 0x00000C28}, {0x00000C2A, 0x00000C39}, {0x00000C3D, 0x00000C3D}, {0x00000C58, 0x00000C5A},
{0x00000C5D, 0x00000C5D}, {0x00000C60, 0x00000C61}, {0x00000C80, 0x00000C80}, {0x00000C85, 0x00000C8C},
{0x00000C8E, 0x00000C90}, {0x00000C92, 0x00000CA8}, {0x00000CAA, 0x00000CB3}, {0x00000CB5, 0x00000CB9},
{0x00000CBD, 0x00000CBD}, {0x00000CDD, 0x00000CDE}, {0x00000CE0, 0x00000CE1}, {0x00000CF1, 0x00000CF2},
{0x00000D04, 0x00000D0C}, {0x00000D0E, 0x00000D10}, {0x00000D12, 0x00000D3A}, {0x00000D3D, 0x00000D3D},
{0x00000D4E, 0x00000D4E}, {0x00000D54, 0x00000D56}, {0x00000D5F, 0x00000D61}, {0x00000D7A, 0x00000D7F},
{0x00000D85, 0x00000D96}, {0x00000D9A, 0x00000DB1}, {0x00000DB3, 0x00000DBB}, {0x00000DBD, 0x00000DBD},
{0x00000DC0, 0x00000DC6}, {0x00000E01, 0x00000E30}, {0x00000E32, 0x00000E33}, {0x00000E40, 0x00000E46},
{0x00000E81, 0x00000E82}, {0x00000E84, 0x00000E84}, {0x00000E86, 0x00000E8A}, {0x00000E8C, 0x00000EA3},
{0x00000EA5, 0x00000EA5}, {0x00000EA7, 0x00000EB0}, {0x00000EB2, 0x00000EB3}, {0x00000EBD, 0x00000EBD},
{0x00000EC0, 0x00000EC4}, {0x00000EC6, 0x00000EC6}, {0x00000EDC, 0x00000EDF}, {0x00000F00, 0x00000F00},
{0x00000F40, 0x00000F47}, {0x00000F49, 0x00000F6C}, {0x00000F88, 0x00000F8C}, {0x00001000, 0x0000102A},
{0x0000103F, 0x0000103F}, {0x00001050, 0x00001055}, {0x0000105A, 0x0000105D}, {0x00001061, 0x00001061},
{0x00001065, 0x00001066}, {0x0000106E, 0x00001070}, {0x00001075, 0x00001081}, {0x0000108E, 0x0000108E},
{0x000010A0, 0x000010C5}, {0x000010C7, 0x000010C7}, {0x000010CD, 0x000010CD}, {0x000010D0, 0x000010FA},
{0x000010FC, 0x00001248}, {0x0000124A, 0x0000124D}, {0x00001250, 0x00001256}, {0x00001258, 0x00001258},
{0x0000125A, 0x0000125D}, {0x00001260, 0x00001288}, {0x0000128A, 0x0000128D}, {0x00001290, 0x000012B0},
{0x000012B2, 0x000012B5}, {0x000012B8, 0x000012BE}, {0x000012C0, 0x000012C0}, {0x000012C2, 0x000012C5},
{0x000012C8, 0x000012D6}, {0x000012D8, 0x00001310}, {0x00001312, 0x00001315}, {0x00001318, 0x0000135A},
{0x00001380, 0x0000138F}, {0x000013A0, 0x000013F5}, {0x000013F8, 0x000013FD}, {0x00001401, 0x0000166C},
{0x0000166F, 0x0000167F}, {0x00001681, 0x0000169A}, {0x000016A0, 0x000016EA}, {0x000016F1, 0x000016F8},
{0x00001700, 0x00001711}, {0x0000171F, 0x00001731}, {0x00001740, 0x00001751}, {0x00001760, 0x0000176C},
{0x0000176E, 0x00001770}, {0x00001780, 0x000017B3}, {0x000017D7, 0x000017D7}, {0x000017DC, 0x000017DC},
{0x00001820, 0x00001878}, {0x00001880, 0x00001884}, {0x00001887, 0x000018A8}, {0x000018AA, 0x000018AA},
{0x000018B0, 0x000018F5}, {0x00001900, 0x0000191E}, {0x00001950, 0x0000196D}, {0x00001970, 0x00001974},
{0x00001980, 0x000019AB}, {0x000019B0, 0x000019C9}, {0x00001A00, 0x00001A16}, {0x00001A20, 0x00001A54},
{0x00001AA7, 0x00001AA7}, {0x00001B05, 0x00001B33}, {0x00001B45, 0x00001B4C}, {0x00001B83, 0x00001BA0},
{0x00001BAE, 0x00001BAF}, {0x00001BBA, 0x00001BE5}, {0x00001C00, 0x00001C23}, {0x00001C4D, 0x00001C4F},
{0x00001C5A, 0x00001C7D}, {0x00001C80, 0x00001C88}, {0x00001C90, 0x00001CBA}, {0x00001CBD, 0x00001CBF},
{0x00001CE9, 0x00001CEC}, {0x00001CEE, 0x00001CF3}, {0x00001CF5, 0x00001CF6}, {0x00001CFA, 0x00001CFA},
{0x00001D00, 0x00001DBF}, {0x00001E00, 0x00001F15}, {0x00001F18, 0x00001F1D}, {0x00001F20, 0x00001F45},
{0x00001F48, 0x00001F4D}, {0x00001F50, 0x00001F57}, {0x00001F59, 0x00001F59}, {0x00001F5B, 0x00001F5B},
{0x00001F5D, 0x00001F5D}, {0x00001F5F, 0x00001F7D}, {0x00001F80, 0x00001FB4}, {0x00001FB6, 0x00001FBC},
{0x00001FBE, 0x00001FBE}, {0x00001FC2, 0x00001FC4}, {0x00001FC6, 0x00001FCC}, {0x00001FD0, 0x00001FD3},
{0x00001FD6, 0x00001FDB}, {0x00001FE0, 0x00001FEC}, {0x00001FF2, 0x00001FF4}, {0x00001FF6, 0x00001FFC},
{0x00002071, 0x00002071}, {0x0000207F, 0x0000207F}, {0x00002090, 0x0000209C}, {0x00002102, 0x00002102},
{0x00002107, 0x00002107}, {0x0000210A, 0x00002113}, {0x00002115, 0x00002115}, {0x00002119, 0x0000211D},
{0x00002124, 0x00002124}, {0x00002126, 0x00002126}, {0x00002128, 0x00002128}, {0x0000212A, 0x0000212D},
{0x0000212F, 0x00002139}, {0x0000213C, 0x0000213F}, {0x00002145, 0x00002149}, {0x0000214E, 0x0000214E},
{0x00002183, 0x00002184}, {0x00002C00, 0x00002CE4}, {0x00002CEB, 0x00002CEE}, {0x00002CF2, 0x00002CF3},
{0x00002D00, 0x00002D25}, {0x00002D27, 0x00002D27}, {0x00002D2D, 0x00002D2D}, {0x00002D30, 0x00002D67},
{0x00002D6F, 0x00002D6F}, {0x00002D80, 0x00002D96}, {0x00002DA0, 0x00002DA6}, {0x00002DA8, 0x00002DAE},
{0x00002DB0, 0x00002DB6}, {0x00002DB8, 0x00002DBE}, {0x00002DC0, 0x00002DC6}, {0x00002DC8, 0x00002DCE},
{0x00002DD0, 0x00002DD6}, {0x00002DD8, 0x00002DDE}, {0x00002E2F, 0x00002E2F}, {0x00003005, 0x00003006},
{0x00003031, 0x00003035}, {0x0000303B, 0x0000303C}, {0x00003041, 0x00003096}, {0x0000309D, 0x0000309F},
{0x000030A1, 0x000030FA}, {0x000030FC, 0x000030FF}, {0x00003105, 0x0000312F}, {0x00003131, 0x0000318E},
{0x000031A0, 0x000031BF}, {0x000031F0, 0x000031FF}, {0x00003400, 0x00004DBF}, {0x00004E00, 0x0000A48C},
{0x0000A4D0, 0x0000A4FD}, {0x0000A500, 0x0000A60C}, {0x0000A610, 0x0000A61F}, {0x0000A62A, 0x0000A62B},
{0x0000A640, 0x0000A66E}, {0x0000A67F, 0x0000A69D}, {0x0000A6A0, 0x0000A6E5}, {0x0000A717, 0x0000A71F},
{0x0000A722, 0x0000A788}, {0x0000A78B, 0x0000A7CA}, {0x0000A7D0, 0x0000A7D1}, {0x0000A7D3, 0x0000A7D3},
{0x0000A7D5, 0x0000A7D9}, {0x0000A7F2, 0x0000A801}, {0x0000A803, 0x0000A805}, {0x0000A807, 0x0000A80A},
{0x0000A80C, 0x0000A822}, {0x0000A840, 0x0000A873}, {0x0000A882, 0x0000A8B3}, {0x0000A8F2, 0x0000A8F7},
{0x0000A8FB, 0x0000A8FB}, {0x0000A8FD, 0x0000A8FE}, {0x0000A90A, 0x0000A925}, {0x0000A930, 0x0000A946},
{0x0000A960, 0x0000A97C}, {0x0000A984, 0x0000A9B2}, {0x0000A9CF, 0x0000A9CF}, {0x0000A9E0, 0x0000A9E4},
@ -129,51 +149,60 @@ const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter = {
{0x000102A0, 0x000102D0}, {0x00010300, 0x0001031F}, {0x0001032D, 0x00010340}, {0x00010342, 0x00010349},
{0x00010350, 0x00010375}, {0x00010380, 0x0001039D}, {0x000103A0, 0x000103C3}, {0x000103C8, 0x000103CF},
{0x00010400, 0x0001049D}, {0x000104B0, 0x000104D3}, {0x000104D8, 0x000104FB}, {0x00010500, 0x00010527},
{0x00010530, 0x00010563}, {0x00010600, 0x00010736}, {0x00010740, 0x00010755}, {0x00010760, 0x00010767},
{0x00010800, 0x00010805}, {0x00010808, 0x00010808}, {0x0001080A, 0x00010835}, {0x00010837, 0x00010838},
{0x0001083C, 0x0001083C}, {0x0001083F, 0x00010855}, {0x00010860, 0x00010876}, {0x00010880, 0x0001089E},
{0x000108E0, 0x000108F2}, {0x000108F4, 0x000108F5}, {0x00010900, 0x00010915}, {0x00010920, 0x00010939},
{0x00010980, 0x000109B7}, {0x000109BE, 0x000109BF}, {0x00010A00, 0x00010A00}, {0x00010A10, 0x00010A13},
{0x00010A15, 0x00010A17}, {0x00010A19, 0x00010A35}, {0x00010A60, 0x00010A7C}, {0x00010A80, 0x00010A9C},
{0x00010AC0, 0x00010AC7}, {0x00010AC9, 0x00010AE4}, {0x00010B00, 0x00010B35}, {0x00010B40, 0x00010B55},
{0x00010B60, 0x00010B72}, {0x00010B80, 0x00010B91}, {0x00010C00, 0x00010C48}, {0x00010C80, 0x00010CB2},
{0x00010CC0, 0x00010CF2}, {0x00010D00, 0x00010D23}, {0x00010E80, 0x00010EA9}, {0x00010EB0, 0x00010EB1},
{0x00010F00, 0x00010F1C}, {0x00010F27, 0x00010F27}, {0x00010F30, 0x00010F45}, {0x00010FB0, 0x00010FC4},
{0x00010FE0, 0x00010FF6}, {0x00011003, 0x00011037}, {0x00011083, 0x000110AF}, {0x000110D0, 0x000110E8},
{0x00011103, 0x00011126}, {0x00011144, 0x00011144}, {0x00011147, 0x00011147}, {0x00011150, 0x00011172},
{0x00011176, 0x00011176}, {0x00011183, 0x000111B2}, {0x000111C1, 0x000111C4}, {0x000111DA, 0x000111DA},
{0x000111DC, 0x000111DC}, {0x00011200, 0x00011211}, {0x00011213, 0x0001122B}, {0x00011280, 0x00011286},
{0x00011288, 0x00011288}, {0x0001128A, 0x0001128D}, {0x0001128F, 0x0001129D}, {0x0001129F, 0x000112A8},
{0x000112B0, 0x000112DE}, {0x00011305, 0x0001130C}, {0x0001130F, 0x00011310}, {0x00011313, 0x00011328},
{0x0001132A, 0x00011330}, {0x00011332, 0x00011333}, {0x00011335, 0x00011339}, {0x0001133D, 0x0001133D},
{0x00011350, 0x00011350}, {0x0001135D, 0x00011361}, {0x00011400, 0x00011434}, {0x00011447, 0x0001144A},
{0x0001145F, 0x00011461}, {0x00011480, 0x000114AF}, {0x000114C4, 0x000114C5}, {0x000114C7, 0x000114C7},
{0x00011580, 0x000115AE}, {0x000115D8, 0x000115DB}, {0x00011600, 0x0001162F}, {0x00011644, 0x00011644},
{0x00011680, 0x000116AA}, {0x000116B8, 0x000116B8}, {0x00011700, 0x0001171A}, {0x00011800, 0x0001182B},
{0x00010530, 0x00010563}, {0x00010570, 0x0001057A}, {0x0001057C, 0x0001058A}, {0x0001058C, 0x00010592},
{0x00010594, 0x00010595}, {0x00010597, 0x000105A1}, {0x000105A3, 0x000105B1}, {0x000105B3, 0x000105B9},
{0x000105BB, 0x000105BC}, {0x00010600, 0x00010736}, {0x00010740, 0x00010755}, {0x00010760, 0x00010767},
{0x00010780, 0x00010785}, {0x00010787, 0x000107B0}, {0x000107B2, 0x000107BA}, {0x00010800, 0x00010805},
{0x00010808, 0x00010808}, {0x0001080A, 0x00010835}, {0x00010837, 0x00010838}, {0x0001083C, 0x0001083C},
{0x0001083F, 0x00010855}, {0x00010860, 0x00010876}, {0x00010880, 0x0001089E}, {0x000108E0, 0x000108F2},
{0x000108F4, 0x000108F5}, {0x00010900, 0x00010915}, {0x00010920, 0x00010939}, {0x00010980, 0x000109B7},
{0x000109BE, 0x000109BF}, {0x00010A00, 0x00010A00}, {0x00010A10, 0x00010A13}, {0x00010A15, 0x00010A17},
{0x00010A19, 0x00010A35}, {0x00010A60, 0x00010A7C}, {0x00010A80, 0x00010A9C}, {0x00010AC0, 0x00010AC7},
{0x00010AC9, 0x00010AE4}, {0x00010B00, 0x00010B35}, {0x00010B40, 0x00010B55}, {0x00010B60, 0x00010B72},
{0x00010B80, 0x00010B91}, {0x00010C00, 0x00010C48}, {0x00010C80, 0x00010CB2}, {0x00010CC0, 0x00010CF2},
{0x00010D00, 0x00010D23}, {0x00010E80, 0x00010EA9}, {0x00010EB0, 0x00010EB1}, {0x00010F00, 0x00010F1C},
{0x00010F27, 0x00010F27}, {0x00010F30, 0x00010F45}, {0x00010F70, 0x00010F81}, {0x00010FB0, 0x00010FC4},
{0x00010FE0, 0x00010FF6}, {0x00011003, 0x00011037}, {0x00011071, 0x00011072}, {0x00011075, 0x00011075},
{0x00011083, 0x000110AF}, {0x000110D0, 0x000110E8}, {0x00011103, 0x00011126}, {0x00011144, 0x00011144},
{0x00011147, 0x00011147}, {0x00011150, 0x00011172}, {0x00011176, 0x00011176}, {0x00011183, 0x000111B2},
{0x000111C1, 0x000111C4}, {0x000111DA, 0x000111DA}, {0x000111DC, 0x000111DC}, {0x00011200, 0x00011211},
{0x00011213, 0x0001122B}, {0x0001123F, 0x00011240}, {0x00011280, 0x00011286}, {0x00011288, 0x00011288},
{0x0001128A, 0x0001128D}, {0x0001128F, 0x0001129D}, {0x0001129F, 0x000112A8}, {0x000112B0, 0x000112DE},
{0x00011305, 0x0001130C}, {0x0001130F, 0x00011310}, {0x00011313, 0x00011328}, {0x0001132A, 0x00011330},
{0x00011332, 0x00011333}, {0x00011335, 0x00011339}, {0x0001133D, 0x0001133D}, {0x00011350, 0x00011350},
{0x0001135D, 0x00011361}, {0x00011400, 0x00011434}, {0x00011447, 0x0001144A}, {0x0001145F, 0x00011461},
{0x00011480, 0x000114AF}, {0x000114C4, 0x000114C5}, {0x000114C7, 0x000114C7}, {0x00011580, 0x000115AE},
{0x000115D8, 0x000115DB}, {0x00011600, 0x0001162F}, {0x00011644, 0x00011644}, {0x00011680, 0x000116AA},
{0x000116B8, 0x000116B8}, {0x00011700, 0x0001171A}, {0x00011740, 0x00011746}, {0x00011800, 0x0001182B},
{0x000118A0, 0x000118DF}, {0x000118FF, 0x00011906}, {0x00011909, 0x00011909}, {0x0001190C, 0x00011913},
{0x00011915, 0x00011916}, {0x00011918, 0x0001192F}, {0x0001193F, 0x0001193F}, {0x00011941, 0x00011941},
{0x000119A0, 0x000119A7}, {0x000119AA, 0x000119D0}, {0x000119E1, 0x000119E1}, {0x000119E3, 0x000119E3},
{0x00011A00, 0x00011A00}, {0x00011A0B, 0x00011A32}, {0x00011A3A, 0x00011A3A}, {0x00011A50, 0x00011A50},
{0x00011A5C, 0x00011A89}, {0x00011A9D, 0x00011A9D}, {0x00011AC0, 0x00011AF8}, {0x00011C00, 0x00011C08},
{0x00011A5C, 0x00011A89}, {0x00011A9D, 0x00011A9D}, {0x00011AB0, 0x00011AF8}, {0x00011C00, 0x00011C08},
{0x00011C0A, 0x00011C2E}, {0x00011C40, 0x00011C40}, {0x00011C72, 0x00011C8F}, {0x00011D00, 0x00011D06},
{0x00011D08, 0x00011D09}, {0x00011D0B, 0x00011D30}, {0x00011D46, 0x00011D46}, {0x00011D60, 0x00011D65},
{0x00011D67, 0x00011D68}, {0x00011D6A, 0x00011D89}, {0x00011D98, 0x00011D98}, {0x00011EE0, 0x00011EF2},
{0x00011FB0, 0x00011FB0}, {0x00012000, 0x00012399}, {0x00012480, 0x00012543}, {0x00013000, 0x0001342E},
{0x00014400, 0x00014646}, {0x00016800, 0x00016A38}, {0x00016A40, 0x00016A5E}, {0x00016AD0, 0x00016AED},
{0x00016B00, 0x00016B2F}, {0x00016B40, 0x00016B43}, {0x00016B63, 0x00016B77}, {0x00016B7D, 0x00016B8F},
{0x00016E40, 0x00016E7F}, {0x00016F00, 0x00016F4A}, {0x00016F50, 0x00016F50}, {0x00016F93, 0x00016F9F},
{0x00016FE0, 0x00016FE1}, {0x00016FE3, 0x00016FE3}, {0x00017000, 0x000187F7}, {0x00018800, 0x00018CD5},
{0x00018D00, 0x00018D08}, {0x0001B000, 0x0001B11E}, {0x0001B150, 0x0001B152}, {0x0001B164, 0x0001B167},
{0x0001B170, 0x0001B2FB}, {0x0001BC00, 0x0001BC6A}, {0x0001BC70, 0x0001BC7C}, {0x0001BC80, 0x0001BC88},
{0x0001BC90, 0x0001BC99}, {0x0001D400, 0x0001D454}, {0x0001D456, 0x0001D49C}, {0x0001D49E, 0x0001D49F},
{0x0001D4A2, 0x0001D4A2}, {0x0001D4A5, 0x0001D4A6}, {0x0001D4A9, 0x0001D4AC}, {0x0001D4AE, 0x0001D4B9},
{0x0001D4BB, 0x0001D4BB}, {0x0001D4BD, 0x0001D4C3}, {0x0001D4C5, 0x0001D505}, {0x0001D507, 0x0001D50A},
{0x0001D50D, 0x0001D514}, {0x0001D516, 0x0001D51C}, {0x0001D51E, 0x0001D539}, {0x0001D53B, 0x0001D53E},
{0x0001D540, 0x0001D544}, {0x0001D546, 0x0001D546}, {0x0001D54A, 0x0001D550}, {0x0001D552, 0x0001D6A5},
{0x0001D6A8, 0x0001D6C0}, {0x0001D6C2, 0x0001D6DA}, {0x0001D6DC, 0x0001D6FA}, {0x0001D6FC, 0x0001D714},
{0x0001D716, 0x0001D734}, {0x0001D736, 0x0001D74E}, {0x0001D750, 0x0001D76E}, {0x0001D770, 0x0001D788},
{0x0001D78A, 0x0001D7A8}, {0x0001D7AA, 0x0001D7C2}, {0x0001D7C4, 0x0001D7CB}, {0x0001E100, 0x0001E12C},
{0x0001E137, 0x0001E13D}, {0x0001E14E, 0x0001E14E}, {0x0001E2C0, 0x0001E2EB}, {0x0001E800, 0x0001E8C4},
{0x00011F02, 0x00011F02}, {0x00011F04, 0x00011F10}, {0x00011F12, 0x00011F33}, {0x00011FB0, 0x00011FB0},
{0x00012000, 0x00012399}, {0x00012480, 0x00012543}, {0x00012F90, 0x00012FF0}, {0x00013000, 0x0001342F},
{0x00013441, 0x00013446}, {0x00014400, 0x00014646}, {0x00016800, 0x00016A38}, {0x00016A40, 0x00016A5E},
{0x00016A70, 0x00016ABE}, {0x00016AD0, 0x00016AED}, {0x00016B00, 0x00016B2F}, {0x00016B40, 0x00016B43},
{0x00016B63, 0x00016B77}, {0x00016B7D, 0x00016B8F}, {0x00016E40, 0x00016E7F}, {0x00016F00, 0x00016F4A},
{0x00016F50, 0x00016F50}, {0x00016F93, 0x00016F9F}, {0x00016FE0, 0x00016FE1}, {0x00016FE3, 0x00016FE3},
{0x00017000, 0x000187F7}, {0x00018800, 0x00018CD5}, {0x00018D00, 0x00018D08}, {0x0001AFF0, 0x0001AFF3},
{0x0001AFF5, 0x0001AFFB}, {0x0001AFFD, 0x0001AFFE}, {0x0001B000, 0x0001B122}, {0x0001B132, 0x0001B132},
{0x0001B150, 0x0001B152}, {0x0001B155, 0x0001B155}, {0x0001B164, 0x0001B167}, {0x0001B170, 0x0001B2FB},
{0x0001BC00, 0x0001BC6A}, {0x0001BC70, 0x0001BC7C}, {0x0001BC80, 0x0001BC88}, {0x0001BC90, 0x0001BC99},
{0x0001D400, 0x0001D454}, {0x0001D456, 0x0001D49C}, {0x0001D49E, 0x0001D49F}, {0x0001D4A2, 0x0001D4A2},
{0x0001D4A5, 0x0001D4A6}, {0x0001D4A9, 0x0001D4AC}, {0x0001D4AE, 0x0001D4B9}, {0x0001D4BB, 0x0001D4BB},
{0x0001D4BD, 0x0001D4C3}, {0x0001D4C5, 0x0001D505}, {0x0001D507, 0x0001D50A}, {0x0001D50D, 0x0001D514},
{0x0001D516, 0x0001D51C}, {0x0001D51E, 0x0001D539}, {0x0001D53B, 0x0001D53E}, {0x0001D540, 0x0001D544},
{0x0001D546, 0x0001D546}, {0x0001D54A, 0x0001D550}, {0x0001D552, 0x0001D6A5}, {0x0001D6A8, 0x0001D6C0},
{0x0001D6C2, 0x0001D6DA}, {0x0001D6DC, 0x0001D6FA}, {0x0001D6FC, 0x0001D714}, {0x0001D716, 0x0001D734},
{0x0001D736, 0x0001D74E}, {0x0001D750, 0x0001D76E}, {0x0001D770, 0x0001D788}, {0x0001D78A, 0x0001D7A8},
{0x0001D7AA, 0x0001D7C2}, {0x0001D7C4, 0x0001D7CB}, {0x0001DF00, 0x0001DF1E}, {0x0001DF25, 0x0001DF2A},
{0x0001E030, 0x0001E06D}, {0x0001E100, 0x0001E12C}, {0x0001E137, 0x0001E13D}, {0x0001E14E, 0x0001E14E},
{0x0001E290, 0x0001E2AD}, {0x0001E2C0, 0x0001E2EB}, {0x0001E4D0, 0x0001E4EB}, {0x0001E7E0, 0x0001E7E6},
{0x0001E7E8, 0x0001E7EB}, {0x0001E7ED, 0x0001E7EE}, {0x0001E7F0, 0x0001E7FE}, {0x0001E800, 0x0001E8C4},
{0x0001E900, 0x0001E943}, {0x0001E94B, 0x0001E94B}, {0x0001EE00, 0x0001EE03}, {0x0001EE05, 0x0001EE1F},
{0x0001EE21, 0x0001EE22}, {0x0001EE24, 0x0001EE24}, {0x0001EE27, 0x0001EE27}, {0x0001EE29, 0x0001EE32},
{0x0001EE34, 0x0001EE37}, {0x0001EE39, 0x0001EE39}, {0x0001EE3B, 0x0001EE3B}, {0x0001EE42, 0x0001EE42},
@ -182,15 +211,14 @@ const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter = {
{0x0001EE5B, 0x0001EE5B}, {0x0001EE5D, 0x0001EE5D}, {0x0001EE5F, 0x0001EE5F}, {0x0001EE61, 0x0001EE62},
{0x0001EE64, 0x0001EE64}, {0x0001EE67, 0x0001EE6A}, {0x0001EE6C, 0x0001EE72}, {0x0001EE74, 0x0001EE77},
{0x0001EE79, 0x0001EE7C}, {0x0001EE7E, 0x0001EE7E}, {0x0001EE80, 0x0001EE89}, {0x0001EE8B, 0x0001EE9B},
{0x0001EEA1, 0x0001EEA3}, {0x0001EEA5, 0x0001EEA9}, {0x0001EEAB, 0x0001EEBB}, {0x00020000, 0x0002A6DD},
{0x0002A700, 0x0002B734}, {0x0002B740, 0x0002B81D}, {0x0002B820, 0x0002CEA1}, {0x0002CEB0, 0x0002EBE0},
{0x0002F800, 0x0002FA1D}, {0x00030000, 0x0003134A},
{0x0001EEA1, 0x0001EEA3}, {0x0001EEA5, 0x0001EEA9}, {0x0001EEAB, 0x0001EEBB}, {0x00020000, 0x0002A6DF},
{0x0002A700, 0x0002B739}, {0x0002B740, 0x0002B81D}, {0x0002B820, 0x0002CEA1}, {0x0002CEB0, 0x0002EBE0},
{0x0002F800, 0x0002FA1D}, {0x00030000, 0x0003134A}, {0x00031350, 0x000323AF},
};
const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace = {
{0x00000009, 0x0000000D}, {0x0000001C, 0x00000020}, {0x00000085, 0x00000085}, {0x000000A0, 0x000000A0},
{0x00001680, 0x00001680}, {0x00002000, 0x0000200A}, {0x00002028, 0x00002029}, {0x0000202F, 0x0000202F},
{0x0000205F, 0x0000205F}, {0x00003000, 0x00003000},
{0x00000020, 0x00000020}, {0x000000A0, 0x000000A0}, {0x00001680, 0x00001680}, {0x00002000, 0x0000200A},
{0x00002028, 0x00002029}, {0x0000202F, 0x0000202F}, {0x0000205F, 0x0000205F}, {0x00003000, 0x00003000},
};
const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_accent_mark = {
@ -200,72 +228,77 @@ const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_accent_mark = {
{0x000006E7, 0x000006E8}, {0x000006EA, 0x000006ED}, {0x00000711, 0x00000711}, {0x00000730, 0x0000074A},
{0x000007A6, 0x000007B0}, {0x000007EB, 0x000007F3}, {0x000007FD, 0x000007FD}, {0x00000816, 0x00000819},
{0x0000081B, 0x00000823}, {0x00000825, 0x00000827}, {0x00000829, 0x0000082D}, {0x00000859, 0x0000085B},
{0x000008D3, 0x000008E1}, {0x000008E3, 0x00000903}, {0x0000093A, 0x0000093C}, {0x0000093E, 0x0000094F},
{0x00000951, 0x00000957}, {0x00000962, 0x00000963}, {0x00000981, 0x00000983}, {0x000009BC, 0x000009BC},
{0x000009BE, 0x000009C4}, {0x000009C7, 0x000009C8}, {0x000009CB, 0x000009CD}, {0x000009D7, 0x000009D7},
{0x000009E2, 0x000009E3}, {0x000009FE, 0x000009FE}, {0x00000A01, 0x00000A03}, {0x00000A3C, 0x00000A3C},
{0x00000A3E, 0x00000A42}, {0x00000A47, 0x00000A48}, {0x00000A4B, 0x00000A4D}, {0x00000A51, 0x00000A51},
{0x00000A70, 0x00000A71}, {0x00000A75, 0x00000A75}, {0x00000A81, 0x00000A83}, {0x00000ABC, 0x00000ABC},
{0x00000ABE, 0x00000AC5}, {0x00000AC7, 0x00000AC9}, {0x00000ACB, 0x00000ACD}, {0x00000AE2, 0x00000AE3},
{0x00000AFA, 0x00000AFF}, {0x00000B01, 0x00000B03}, {0x00000B3C, 0x00000B3C}, {0x00000B3E, 0x00000B44},
{0x00000B47, 0x00000B48}, {0x00000B4B, 0x00000B4D}, {0x00000B55, 0x00000B57}, {0x00000B62, 0x00000B63},
{0x00000B82, 0x00000B82}, {0x00000BBE, 0x00000BC2}, {0x00000BC6, 0x00000BC8}, {0x00000BCA, 0x00000BCD},
{0x00000BD7, 0x00000BD7}, {0x00000C00, 0x00000C04}, {0x00000C3E, 0x00000C44}, {0x00000C46, 0x00000C48},
{0x00000C4A, 0x00000C4D}, {0x00000C55, 0x00000C56}, {0x00000C62, 0x00000C63}, {0x00000C81, 0x00000C83},
{0x00000CBC, 0x00000CBC}, {0x00000CBE, 0x00000CC4}, {0x00000CC6, 0x00000CC8}, {0x00000CCA, 0x00000CCD},
{0x00000CD5, 0x00000CD6}, {0x00000CE2, 0x00000CE3}, {0x00000D00, 0x00000D03}, {0x00000D3B, 0x00000D3C},
{0x00000D3E, 0x00000D44}, {0x00000D46, 0x00000D48}, {0x00000D4A, 0x00000D4D}, {0x00000D57, 0x00000D57},
{0x00000D62, 0x00000D63}, {0x00000D81, 0x00000D83}, {0x00000DCA, 0x00000DCA}, {0x00000DCF, 0x00000DD4},
{0x00000DD6, 0x00000DD6}, {0x00000DD8, 0x00000DDF}, {0x00000DF2, 0x00000DF3}, {0x00000E31, 0x00000E31},
{0x00000E34, 0x00000E3A}, {0x00000E47, 0x00000E4E}, {0x00000EB1, 0x00000EB1}, {0x00000EB4, 0x00000EBC},
{0x00000EC8, 0x00000ECD}, {0x00000F18, 0x00000F19}, {0x00000F35, 0x00000F35}, {0x00000F37, 0x00000F37},
{0x00000F39, 0x00000F39}, {0x00000F3E, 0x00000F3F}, {0x00000F71, 0x00000F84}, {0x00000F86, 0x00000F87},
{0x00000F8D, 0x00000F97}, {0x00000F99, 0x00000FBC}, {0x00000FC6, 0x00000FC6}, {0x0000102B, 0x0000103E},
{0x00001056, 0x00001059}, {0x0000105E, 0x00001060}, {0x00001062, 0x00001064}, {0x00001067, 0x0000106D},
{0x00001071, 0x00001074}, {0x00001082, 0x0000108D}, {0x0000108F, 0x0000108F}, {0x0000109A, 0x0000109D},
{0x0000135D, 0x0000135F}, {0x00001712, 0x00001714}, {0x00001732, 0x00001734}, {0x00001752, 0x00001753},
{0x00001772, 0x00001773}, {0x000017B4, 0x000017D3}, {0x000017DD, 0x000017DD}, {0x0000180B, 0x0000180D},
{0x00000898, 0x0000089F}, {0x000008CA, 0x000008E1}, {0x000008E3, 0x00000903}, {0x0000093A, 0x0000093C},
{0x0000093E, 0x0000094F}, {0x00000951, 0x00000957}, {0x00000962, 0x00000963}, {0x00000981, 0x00000983},
{0x000009BC, 0x000009BC}, {0x000009BE, 0x000009C4}, {0x000009C7, 0x000009C8}, {0x000009CB, 0x000009CD},
{0x000009D7, 0x000009D7}, {0x000009E2, 0x000009E3}, {0x000009FE, 0x000009FE}, {0x00000A01, 0x00000A03},
{0x00000A3C, 0x00000A3C}, {0x00000A3E, 0x00000A42}, {0x00000A47, 0x00000A48}, {0x00000A4B, 0x00000A4D},
{0x00000A51, 0x00000A51}, {0x00000A70, 0x00000A71}, {0x00000A75, 0x00000A75}, {0x00000A81, 0x00000A83},
{0x00000ABC, 0x00000ABC}, {0x00000ABE, 0x00000AC5}, {0x00000AC7, 0x00000AC9}, {0x00000ACB, 0x00000ACD},
{0x00000AE2, 0x00000AE3}, {0x00000AFA, 0x00000AFF}, {0x00000B01, 0x00000B03}, {0x00000B3C, 0x00000B3C},
{0x00000B3E, 0x00000B44}, {0x00000B47, 0x00000B48}, {0x00000B4B, 0x00000B4D}, {0x00000B55, 0x00000B57},
{0x00000B62, 0x00000B63}, {0x00000B82, 0x00000B82}, {0x00000BBE, 0x00000BC2}, {0x00000BC6, 0x00000BC8},
{0x00000BCA, 0x00000BCD}, {0x00000BD7, 0x00000BD7}, {0x00000C00, 0x00000C04}, {0x00000C3C, 0x00000C3C},
{0x00000C3E, 0x00000C44}, {0x00000C46, 0x00000C48}, {0x00000C4A, 0x00000C4D}, {0x00000C55, 0x00000C56},
{0x00000C62, 0x00000C63}, {0x00000C81, 0x00000C83}, {0x00000CBC, 0x00000CBC}, {0x00000CBE, 0x00000CC4},
{0x00000CC6, 0x00000CC8}, {0x00000CCA, 0x00000CCD}, {0x00000CD5, 0x00000CD6}, {0x00000CE2, 0x00000CE3},
{0x00000CF3, 0x00000CF3}, {0x00000D00, 0x00000D03}, {0x00000D3B, 0x00000D3C}, {0x00000D3E, 0x00000D44},
{0x00000D46, 0x00000D48}, {0x00000D4A, 0x00000D4D}, {0x00000D57, 0x00000D57}, {0x00000D62, 0x00000D63},
{0x00000D81, 0x00000D83}, {0x00000DCA, 0x00000DCA}, {0x00000DCF, 0x00000DD4}, {0x00000DD6, 0x00000DD6},
{0x00000DD8, 0x00000DDF}, {0x00000DF2, 0x00000DF3}, {0x00000E31, 0x00000E31}, {0x00000E34, 0x00000E3A},
{0x00000E47, 0x00000E4E}, {0x00000EB1, 0x00000EB1}, {0x00000EB4, 0x00000EBC}, {0x00000EC8, 0x00000ECE},
{0x00000F18, 0x00000F19}, {0x00000F35, 0x00000F35}, {0x00000F37, 0x00000F37}, {0x00000F39, 0x00000F39},
{0x00000F3E, 0x00000F3F}, {0x00000F71, 0x00000F84}, {0x00000F86, 0x00000F87}, {0x00000F8D, 0x00000F97},
{0x00000F99, 0x00000FBC}, {0x00000FC6, 0x00000FC6}, {0x0000102B, 0x0000103E}, {0x00001056, 0x00001059},
{0x0000105E, 0x00001060}, {0x00001062, 0x00001064}, {0x00001067, 0x0000106D}, {0x00001071, 0x00001074},
{0x00001082, 0x0000108D}, {0x0000108F, 0x0000108F}, {0x0000109A, 0x0000109D}, {0x0000135D, 0x0000135F},
{0x00001712, 0x00001715}, {0x00001732, 0x00001734}, {0x00001752, 0x00001753}, {0x00001772, 0x00001773},
{0x000017B4, 0x000017D3}, {0x000017DD, 0x000017DD}, {0x0000180B, 0x0000180D}, {0x0000180F, 0x0000180F},
{0x00001885, 0x00001886}, {0x000018A9, 0x000018A9}, {0x00001920, 0x0000192B}, {0x00001930, 0x0000193B},
{0x00001A17, 0x00001A1B}, {0x00001A55, 0x00001A5E}, {0x00001A60, 0x00001A7C}, {0x00001A7F, 0x00001A7F},
{0x00001AB0, 0x00001AC0}, {0x00001B00, 0x00001B04}, {0x00001B34, 0x00001B44}, {0x00001B6B, 0x00001B73},
{0x00001AB0, 0x00001ACE}, {0x00001B00, 0x00001B04}, {0x00001B34, 0x00001B44}, {0x00001B6B, 0x00001B73},
{0x00001B80, 0x00001B82}, {0x00001BA1, 0x00001BAD}, {0x00001BE6, 0x00001BF3}, {0x00001C24, 0x00001C37},
{0x00001CD0, 0x00001CD2}, {0x00001CD4, 0x00001CE8}, {0x00001CED, 0x00001CED}, {0x00001CF4, 0x00001CF4},
{0x00001CF7, 0x00001CF9}, {0x00001DC0, 0x00001DF9}, {0x00001DFB, 0x00001DFF}, {0x000020D0, 0x000020F0},
{0x00002CEF, 0x00002CF1}, {0x00002D7F, 0x00002D7F}, {0x00002DE0, 0x00002DFF}, {0x0000302A, 0x0000302F},
{0x00003099, 0x0000309A}, {0x0000A66F, 0x0000A672}, {0x0000A674, 0x0000A67D}, {0x0000A69E, 0x0000A69F},
{0x0000A6F0, 0x0000A6F1}, {0x0000A802, 0x0000A802}, {0x0000A806, 0x0000A806}, {0x0000A80B, 0x0000A80B},
{0x0000A823, 0x0000A827}, {0x0000A82C, 0x0000A82C}, {0x0000A880, 0x0000A881}, {0x0000A8B4, 0x0000A8C5},
{0x0000A8E0, 0x0000A8F1}, {0x0000A8FF, 0x0000A8FF}, {0x0000A926, 0x0000A92D}, {0x0000A947, 0x0000A953},
{0x0000A980, 0x0000A983}, {0x0000A9B3, 0x0000A9C0}, {0x0000A9E5, 0x0000A9E5}, {0x0000AA29, 0x0000AA36},
{0x0000AA43, 0x0000AA43}, {0x0000AA4C, 0x0000AA4D}, {0x0000AA7B, 0x0000AA7D}, {0x0000AAB0, 0x0000AAB0},
{0x0000AAB2, 0x0000AAB4}, {0x0000AAB7, 0x0000AAB8}, {0x0000AABE, 0x0000AABF}, {0x0000AAC1, 0x0000AAC1},
{0x0000AAEB, 0x0000AAEF}, {0x0000AAF5, 0x0000AAF6}, {0x0000ABE3, 0x0000ABEA}, {0x0000ABEC, 0x0000ABED},
{0x0000FB1E, 0x0000FB1E}, {0x0000FE00, 0x0000FE0F}, {0x0000FE20, 0x0000FE2F}, {0x000101FD, 0x000101FD},
{0x000102E0, 0x000102E0}, {0x00010376, 0x0001037A}, {0x00010A01, 0x00010A03}, {0x00010A05, 0x00010A06},
{0x00010A0C, 0x00010A0F}, {0x00010A38, 0x00010A3A}, {0x00010A3F, 0x00010A3F}, {0x00010AE5, 0x00010AE6},
{0x00010D24, 0x00010D27}, {0x00010EAB, 0x00010EAC}, {0x00010F46, 0x00010F50}, {0x00011000, 0x00011002},
{0x00011038, 0x00011046}, {0x0001107F, 0x00011082}, {0x000110B0, 0x000110BA}, {0x00011100, 0x00011102},
{0x00001CF7, 0x00001CF9}, {0x00001DC0, 0x00001DFF}, {0x000020D0, 0x000020F0}, {0x00002CEF, 0x00002CF1},
{0x00002D7F, 0x00002D7F}, {0x00002DE0, 0x00002DFF}, {0x0000302A, 0x0000302F}, {0x00003099, 0x0000309A},
{0x0000A66F, 0x0000A672}, {0x0000A674, 0x0000A67D}, {0x0000A69E, 0x0000A69F}, {0x0000A6F0, 0x0000A6F1},
{0x0000A802, 0x0000A802}, {0x0000A806, 0x0000A806}, {0x0000A80B, 0x0000A80B}, {0x0000A823, 0x0000A827},
{0x0000A82C, 0x0000A82C}, {0x0000A880, 0x0000A881}, {0x0000A8B4, 0x0000A8C5}, {0x0000A8E0, 0x0000A8F1},
{0x0000A8FF, 0x0000A8FF}, {0x0000A926, 0x0000A92D}, {0x0000A947, 0x0000A953}, {0x0000A980, 0x0000A983},
{0x0000A9B3, 0x0000A9C0}, {0x0000A9E5, 0x0000A9E5}, {0x0000AA29, 0x0000AA36}, {0x0000AA43, 0x0000AA43},
{0x0000AA4C, 0x0000AA4D}, {0x0000AA7B, 0x0000AA7D}, {0x0000AAB0, 0x0000AAB0}, {0x0000AAB2, 0x0000AAB4},
{0x0000AAB7, 0x0000AAB8}, {0x0000AABE, 0x0000AABF}, {0x0000AAC1, 0x0000AAC1}, {0x0000AAEB, 0x0000AAEF},
{0x0000AAF5, 0x0000AAF6}, {0x0000ABE3, 0x0000ABEA}, {0x0000ABEC, 0x0000ABED}, {0x0000FB1E, 0x0000FB1E},
{0x0000FE00, 0x0000FE0F}, {0x0000FE20, 0x0000FE2F}, {0x000101FD, 0x000101FD}, {0x000102E0, 0x000102E0},
{0x00010376, 0x0001037A}, {0x00010A01, 0x00010A03}, {0x00010A05, 0x00010A06}, {0x00010A0C, 0x00010A0F},
{0x00010A38, 0x00010A3A}, {0x00010A3F, 0x00010A3F}, {0x00010AE5, 0x00010AE6}, {0x00010D24, 0x00010D27},
{0x00010EAB, 0x00010EAC}, {0x00010EFD, 0x00010EFF}, {0x00010F46, 0x00010F50}, {0x00010F82, 0x00010F85},
{0x00011000, 0x00011002}, {0x00011038, 0x00011046}, {0x00011070, 0x00011070}, {0x00011073, 0x00011074},
{0x0001107F, 0x00011082}, {0x000110B0, 0x000110BA}, {0x000110C2, 0x000110C2}, {0x00011100, 0x00011102},
{0x00011127, 0x00011134}, {0x00011145, 0x00011146}, {0x00011173, 0x00011173}, {0x00011180, 0x00011182},
{0x000111B3, 0x000111C0}, {0x000111C9, 0x000111CC}, {0x000111CE, 0x000111CF}, {0x0001122C, 0x00011237},
{0x0001123E, 0x0001123E}, {0x000112DF, 0x000112EA}, {0x00011300, 0x00011303}, {0x0001133B, 0x0001133C},
{0x0001133E, 0x00011344}, {0x00011347, 0x00011348}, {0x0001134B, 0x0001134D}, {0x00011357, 0x00011357},
{0x00011362, 0x00011363}, {0x00011366, 0x0001136C}, {0x00011370, 0x00011374}, {0x00011435, 0x00011446},
{0x0001145E, 0x0001145E}, {0x000114B0, 0x000114C3}, {0x000115AF, 0x000115B5}, {0x000115B8, 0x000115C0},
{0x000115DC, 0x000115DD}, {0x00011630, 0x00011640}, {0x000116AB, 0x000116B7}, {0x0001171D, 0x0001172B},
{0x0001182C, 0x0001183A}, {0x00011930, 0x00011935}, {0x00011937, 0x00011938}, {0x0001193B, 0x0001193E},
{0x00011940, 0x00011940}, {0x00011942, 0x00011943}, {0x000119D1, 0x000119D7}, {0x000119DA, 0x000119E0},
{0x000119E4, 0x000119E4}, {0x00011A01, 0x00011A0A}, {0x00011A33, 0x00011A39}, {0x00011A3B, 0x00011A3E},
{0x00011A47, 0x00011A47}, {0x00011A51, 0x00011A5B}, {0x00011A8A, 0x00011A99}, {0x00011C2F, 0x00011C36},
{0x00011C38, 0x00011C3F}, {0x00011C92, 0x00011CA7}, {0x00011CA9, 0x00011CB6}, {0x00011D31, 0x00011D36},
{0x00011D3A, 0x00011D3A}, {0x00011D3C, 0x00011D3D}, {0x00011D3F, 0x00011D45}, {0x00011D47, 0x00011D47},
{0x00011D8A, 0x00011D8E}, {0x00011D90, 0x00011D91}, {0x00011D93, 0x00011D97}, {0x00011EF3, 0x00011EF6},
{0x00016AF0, 0x00016AF4}, {0x00016B30, 0x00016B36}, {0x00016F4F, 0x00016F4F}, {0x00016F51, 0x00016F87},
{0x00016F8F, 0x00016F92}, {0x00016FE4, 0x00016FE4}, {0x00016FF0, 0x00016FF1}, {0x0001BC9D, 0x0001BC9E},
{0x0001D165, 0x0001D169}, {0x0001D16D, 0x0001D172}, {0x0001D17B, 0x0001D182}, {0x0001D185, 0x0001D18B},
{0x0001D1AA, 0x0001D1AD}, {0x0001D242, 0x0001D244}, {0x0001DA00, 0x0001DA36}, {0x0001DA3B, 0x0001DA6C},
{0x0001DA75, 0x0001DA75}, {0x0001DA84, 0x0001DA84}, {0x0001DA9B, 0x0001DA9F}, {0x0001DAA1, 0x0001DAAF},
{0x0001E000, 0x0001E006}, {0x0001E008, 0x0001E018}, {0x0001E01B, 0x0001E021}, {0x0001E023, 0x0001E024},
{0x0001E026, 0x0001E02A}, {0x0001E130, 0x0001E136}, {0x0001E2EC, 0x0001E2EF}, {0x0001E8D0, 0x0001E8D6},
{0x0001123E, 0x0001123E}, {0x00011241, 0x00011241}, {0x000112DF, 0x000112EA}, {0x00011300, 0x00011303},
{0x0001133B, 0x0001133C}, {0x0001133E, 0x00011344}, {0x00011347, 0x00011348}, {0x0001134B, 0x0001134D},
{0x00011357, 0x00011357}, {0x00011362, 0x00011363}, {0x00011366, 0x0001136C}, {0x00011370, 0x00011374},
{0x00011435, 0x00011446}, {0x0001145E, 0x0001145E}, {0x000114B0, 0x000114C3}, {0x000115AF, 0x000115B5},
{0x000115B8, 0x000115C0}, {0x000115DC, 0x000115DD}, {0x00011630, 0x00011640}, {0x000116AB, 0x000116B7},
{0x0001171D, 0x0001172B}, {0x0001182C, 0x0001183A}, {0x00011930, 0x00011935}, {0x00011937, 0x00011938},
{0x0001193B, 0x0001193E}, {0x00011940, 0x00011940}, {0x00011942, 0x00011943}, {0x000119D1, 0x000119D7},
{0x000119DA, 0x000119E0}, {0x000119E4, 0x000119E4}, {0x00011A01, 0x00011A0A}, {0x00011A33, 0x00011A39},
{0x00011A3B, 0x00011A3E}, {0x00011A47, 0x00011A47}, {0x00011A51, 0x00011A5B}, {0x00011A8A, 0x00011A99},
{0x00011C2F, 0x00011C36}, {0x00011C38, 0x00011C3F}, {0x00011C92, 0x00011CA7}, {0x00011CA9, 0x00011CB6},
{0x00011D31, 0x00011D36}, {0x00011D3A, 0x00011D3A}, {0x00011D3C, 0x00011D3D}, {0x00011D3F, 0x00011D45},
{0x00011D47, 0x00011D47}, {0x00011D8A, 0x00011D8E}, {0x00011D90, 0x00011D91}, {0x00011D93, 0x00011D97},
{0x00011EF3, 0x00011EF6}, {0x00011F00, 0x00011F01}, {0x00011F03, 0x00011F03}, {0x00011F34, 0x00011F3A},
{0x00011F3E, 0x00011F42}, {0x00013440, 0x00013440}, {0x00013447, 0x00013455}, {0x00016AF0, 0x00016AF4},
{0x00016B30, 0x00016B36}, {0x00016F4F, 0x00016F4F}, {0x00016F51, 0x00016F87}, {0x00016F8F, 0x00016F92},
{0x00016FE4, 0x00016FE4}, {0x00016FF0, 0x00016FF1}, {0x0001BC9D, 0x0001BC9E}, {0x0001CF00, 0x0001CF2D},
{0x0001CF30, 0x0001CF46}, {0x0001D165, 0x0001D169}, {0x0001D16D, 0x0001D172}, {0x0001D17B, 0x0001D182},
{0x0001D185, 0x0001D18B}, {0x0001D1AA, 0x0001D1AD}, {0x0001D242, 0x0001D244}, {0x0001DA00, 0x0001DA36},
{0x0001DA3B, 0x0001DA6C}, {0x0001DA75, 0x0001DA75}, {0x0001DA84, 0x0001DA84}, {0x0001DA9B, 0x0001DA9F},
{0x0001DAA1, 0x0001DAAF}, {0x0001E000, 0x0001E006}, {0x0001E008, 0x0001E018}, {0x0001E01B, 0x0001E021},
{0x0001E023, 0x0001E024}, {0x0001E026, 0x0001E02A}, {0x0001E08F, 0x0001E08F}, {0x0001E130, 0x0001E136},
{0x0001E2AE, 0x0001E2AE}, {0x0001E2EC, 0x0001E2EF}, {0x0001E4EC, 0x0001E4EF}, {0x0001E8D0, 0x0001E8D6},
{0x0001E944, 0x0001E94A}, {0x000E0100, 0x000E01EF},
};
@ -276,7 +309,7 @@ const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_punctuation = {
{0x000000B6, 0x000000B7}, {0x000000BB, 0x000000BB}, {0x000000BF, 0x000000BF}, {0x0000037E, 0x0000037E},
{0x00000387, 0x00000387}, {0x0000055A, 0x0000055F}, {0x00000589, 0x0000058A}, {0x000005BE, 0x000005BE},
{0x000005C0, 0x000005C0}, {0x000005C3, 0x000005C3}, {0x000005C6, 0x000005C6}, {0x000005F3, 0x000005F4},
{0x00000609, 0x0000060A}, {0x0000060C, 0x0000060D}, {0x0000061B, 0x0000061B}, {0x0000061E, 0x0000061F},
{0x00000609, 0x0000060A}, {0x0000060C, 0x0000060D}, {0x0000061B, 0x0000061B}, {0x0000061D, 0x0000061F},
{0x0000066A, 0x0000066D}, {0x000006D4, 0x000006D4}, {0x00000700, 0x0000070D}, {0x000007F7, 0x000007F9},
{0x00000830, 0x0000083E}, {0x0000085E, 0x0000085E}, {0x00000964, 0x00000965}, {0x00000970, 0x00000970},
{0x000009FD, 0x000009FD}, {0x00000A76, 0x00000A76}, {0x00000AF0, 0x00000AF0}, {0x00000C77, 0x00000C77},
@ -286,37 +319,38 @@ const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_punctuation = {
{0x00001360, 0x00001368}, {0x00001400, 0x00001400}, {0x0000166E, 0x0000166E}, {0x0000169B, 0x0000169C},
{0x000016EB, 0x000016ED}, {0x00001735, 0x00001736}, {0x000017D4, 0x000017D6}, {0x000017D8, 0x000017DA},
{0x00001800, 0x0000180A}, {0x00001944, 0x00001945}, {0x00001A1E, 0x00001A1F}, {0x00001AA0, 0x00001AA6},
{0x00001AA8, 0x00001AAD}, {0x00001B5A, 0x00001B60}, {0x00001BFC, 0x00001BFF}, {0x00001C3B, 0x00001C3F},
{0x00001C7E, 0x00001C7F}, {0x00001CC0, 0x00001CC7}, {0x00001CD3, 0x00001CD3}, {0x00002010, 0x00002027},
{0x00002030, 0x00002043}, {0x00002045, 0x00002051}, {0x00002053, 0x0000205E}, {0x0000207D, 0x0000207E},
{0x0000208D, 0x0000208E}, {0x00002308, 0x0000230B}, {0x00002329, 0x0000232A}, {0x00002768, 0x00002775},
{0x000027C5, 0x000027C6}, {0x000027E6, 0x000027EF}, {0x00002983, 0x00002998}, {0x000029D8, 0x000029DB},
{0x000029FC, 0x000029FD}, {0x00002CF9, 0x00002CFC}, {0x00002CFE, 0x00002CFF}, {0x00002D70, 0x00002D70},
{0x00002E00, 0x00002E2E}, {0x00002E30, 0x00002E4F}, {0x00002E52, 0x00002E52}, {0x00003001, 0x00003003},
{0x00003008, 0x00003011}, {0x00003014, 0x0000301F}, {0x00003030, 0x00003030}, {0x0000303D, 0x0000303D},
{0x000030A0, 0x000030A0}, {0x000030FB, 0x000030FB}, {0x0000A4FE, 0x0000A4FF}, {0x0000A60D, 0x0000A60F},
{0x0000A673, 0x0000A673}, {0x0000A67E, 0x0000A67E}, {0x0000A6F2, 0x0000A6F7}, {0x0000A874, 0x0000A877},
{0x0000A8CE, 0x0000A8CF}, {0x0000A8F8, 0x0000A8FA}, {0x0000A8FC, 0x0000A8FC}, {0x0000A92E, 0x0000A92F},
{0x0000A95F, 0x0000A95F}, {0x0000A9C1, 0x0000A9CD}, {0x0000A9DE, 0x0000A9DF}, {0x0000AA5C, 0x0000AA5F},
{0x0000AADE, 0x0000AADF}, {0x0000AAF0, 0x0000AAF1}, {0x0000ABEB, 0x0000ABEB}, {0x0000FD3E, 0x0000FD3F},
{0x0000FE10, 0x0000FE19}, {0x0000FE30, 0x0000FE52}, {0x0000FE54, 0x0000FE61}, {0x0000FE63, 0x0000FE63},
{0x0000FE68, 0x0000FE68}, {0x0000FE6A, 0x0000FE6B}, {0x0000FF01, 0x0000FF03}, {0x0000FF05, 0x0000FF0A},
{0x0000FF0C, 0x0000FF0F}, {0x0000FF1A, 0x0000FF1B}, {0x0000FF1F, 0x0000FF20}, {0x0000FF3B, 0x0000FF3D},
{0x0000FF3F, 0x0000FF3F}, {0x0000FF5B, 0x0000FF5B}, {0x0000FF5D, 0x0000FF5D}, {0x0000FF5F, 0x0000FF65},
{0x00010100, 0x00010102}, {0x0001039F, 0x0001039F}, {0x000103D0, 0x000103D0}, {0x0001056F, 0x0001056F},
{0x00010857, 0x00010857}, {0x0001091F, 0x0001091F}, {0x0001093F, 0x0001093F}, {0x00010A50, 0x00010A58},
{0x00010A7F, 0x00010A7F}, {0x00010AF0, 0x00010AF6}, {0x00010B39, 0x00010B3F}, {0x00010B99, 0x00010B9C},
{0x00010EAD, 0x00010EAD}, {0x00010F55, 0x00010F59}, {0x00011047, 0x0001104D}, {0x000110BB, 0x000110BC},
{0x000110BE, 0x000110C1}, {0x00011140, 0x00011143}, {0x00011174, 0x00011175}, {0x000111C5, 0x000111C8},
{0x000111CD, 0x000111CD}, {0x000111DB, 0x000111DB}, {0x000111DD, 0x000111DF}, {0x00011238, 0x0001123D},
{0x000112A9, 0x000112A9}, {0x0001144B, 0x0001144F}, {0x0001145A, 0x0001145B}, {0x0001145D, 0x0001145D},
{0x000114C6, 0x000114C6}, {0x000115C1, 0x000115D7}, {0x00011641, 0x00011643}, {0x00011660, 0x0001166C},
{0x0001173C, 0x0001173E}, {0x0001183B, 0x0001183B}, {0x00011944, 0x00011946}, {0x000119E2, 0x000119E2},
{0x00011A3F, 0x00011A46}, {0x00011A9A, 0x00011A9C}, {0x00011A9E, 0x00011AA2}, {0x00011C41, 0x00011C45},
{0x00011C70, 0x00011C71}, {0x00011EF7, 0x00011EF8}, {0x00011FFF, 0x00011FFF}, {0x00012470, 0x00012474},
{0x00016A6E, 0x00016A6F}, {0x00016AF5, 0x00016AF5}, {0x00016B37, 0x00016B3B}, {0x00016B44, 0x00016B44},
{0x00016E97, 0x00016E9A}, {0x00016FE2, 0x00016FE2}, {0x0001BC9F, 0x0001BC9F}, {0x0001DA87, 0x0001DA8B},
{0x0001E95E, 0x0001E95F},
{0x00001AA8, 0x00001AAD}, {0x00001B5A, 0x00001B60}, {0x00001B7D, 0x00001B7E}, {0x00001BFC, 0x00001BFF},
{0x00001C3B, 0x00001C3F}, {0x00001C7E, 0x00001C7F}, {0x00001CC0, 0x00001CC7}, {0x00001CD3, 0x00001CD3},
{0x00002010, 0x00002027}, {0x00002030, 0x00002043}, {0x00002045, 0x00002051}, {0x00002053, 0x0000205E},
{0x0000207D, 0x0000207E}, {0x0000208D, 0x0000208E}, {0x00002308, 0x0000230B}, {0x00002329, 0x0000232A},
{0x00002768, 0x00002775}, {0x000027C5, 0x000027C6}, {0x000027E6, 0x000027EF}, {0x00002983, 0x00002998},
{0x000029D8, 0x000029DB}, {0x000029FC, 0x000029FD}, {0x00002CF9, 0x00002CFC}, {0x00002CFE, 0x00002CFF},
{0x00002D70, 0x00002D70}, {0x00002E00, 0x00002E2E}, {0x00002E30, 0x00002E4F}, {0x00002E52, 0x00002E5D},
{0x00003001, 0x00003003}, {0x00003008, 0x00003011}, {0x00003014, 0x0000301F}, {0x00003030, 0x00003030},
{0x0000303D, 0x0000303D}, {0x000030A0, 0x000030A0}, {0x000030FB, 0x000030FB}, {0x0000A4FE, 0x0000A4FF},
{0x0000A60D, 0x0000A60F}, {0x0000A673, 0x0000A673}, {0x0000A67E, 0x0000A67E}, {0x0000A6F2, 0x0000A6F7},
{0x0000A874, 0x0000A877}, {0x0000A8CE, 0x0000A8CF}, {0x0000A8F8, 0x0000A8FA}, {0x0000A8FC, 0x0000A8FC},
{0x0000A92E, 0x0000A92F}, {0x0000A95F, 0x0000A95F}, {0x0000A9C1, 0x0000A9CD}, {0x0000A9DE, 0x0000A9DF},
{0x0000AA5C, 0x0000AA5F}, {0x0000AADE, 0x0000AADF}, {0x0000AAF0, 0x0000AAF1}, {0x0000ABEB, 0x0000ABEB},
{0x0000FD3E, 0x0000FD3F}, {0x0000FE10, 0x0000FE19}, {0x0000FE30, 0x0000FE52}, {0x0000FE54, 0x0000FE61},
{0x0000FE63, 0x0000FE63}, {0x0000FE68, 0x0000FE68}, {0x0000FE6A, 0x0000FE6B}, {0x0000FF01, 0x0000FF03},
{0x0000FF05, 0x0000FF0A}, {0x0000FF0C, 0x0000FF0F}, {0x0000FF1A, 0x0000FF1B}, {0x0000FF1F, 0x0000FF20},
{0x0000FF3B, 0x0000FF3D}, {0x0000FF3F, 0x0000FF3F}, {0x0000FF5B, 0x0000FF5B}, {0x0000FF5D, 0x0000FF5D},
{0x0000FF5F, 0x0000FF65}, {0x00010100, 0x00010102}, {0x0001039F, 0x0001039F}, {0x000103D0, 0x000103D0},
{0x0001056F, 0x0001056F}, {0x00010857, 0x00010857}, {0x0001091F, 0x0001091F}, {0x0001093F, 0x0001093F},
{0x00010A50, 0x00010A58}, {0x00010A7F, 0x00010A7F}, {0x00010AF0, 0x00010AF6}, {0x00010B39, 0x00010B3F},
{0x00010B99, 0x00010B9C}, {0x00010EAD, 0x00010EAD}, {0x00010F55, 0x00010F59}, {0x00010F86, 0x00010F89},
{0x00011047, 0x0001104D}, {0x000110BB, 0x000110BC}, {0x000110BE, 0x000110C1}, {0x00011140, 0x00011143},
{0x00011174, 0x00011175}, {0x000111C5, 0x000111C8}, {0x000111CD, 0x000111CD}, {0x000111DB, 0x000111DB},
{0x000111DD, 0x000111DF}, {0x00011238, 0x0001123D}, {0x000112A9, 0x000112A9}, {0x0001144B, 0x0001144F},
{0x0001145A, 0x0001145B}, {0x0001145D, 0x0001145D}, {0x000114C6, 0x000114C6}, {0x000115C1, 0x000115D7},
{0x00011641, 0x00011643}, {0x00011660, 0x0001166C}, {0x000116B9, 0x000116B9}, {0x0001173C, 0x0001173E},
{0x0001183B, 0x0001183B}, {0x00011944, 0x00011946}, {0x000119E2, 0x000119E2}, {0x00011A3F, 0x00011A46},
{0x00011A9A, 0x00011A9C}, {0x00011A9E, 0x00011AA2}, {0x00011B00, 0x00011B09}, {0x00011C41, 0x00011C45},
{0x00011C70, 0x00011C71}, {0x00011EF7, 0x00011EF8}, {0x00011F43, 0x00011F4F}, {0x00011FFF, 0x00011FFF},
{0x00012470, 0x00012474}, {0x00012FF1, 0x00012FF2}, {0x00016A6E, 0x00016A6F}, {0x00016AF5, 0x00016AF5},
{0x00016B37, 0x00016B3B}, {0x00016B44, 0x00016B44}, {0x00016E97, 0x00016E9A}, {0x00016FE2, 0x00016FE2},
{0x0001BC9F, 0x0001BC9F}, {0x0001DA87, 0x0001DA8B}, {0x0001E95E, 0x0001E95F},
};
const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_symbol = {
@ -328,40 +362,41 @@ const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_symbol = {
{0x00000375, 0x00000375}, {0x00000384, 0x00000385}, {0x000003F6, 0x000003F6}, {0x00000482, 0x00000482},
{0x0000058D, 0x0000058F}, {0x00000606, 0x00000608}, {0x0000060B, 0x0000060B}, {0x0000060E, 0x0000060F},
{0x000006DE, 0x000006DE}, {0x000006E9, 0x000006E9}, {0x000006FD, 0x000006FE}, {0x000007F6, 0x000007F6},
{0x000007FE, 0x000007FF}, {0x000009F2, 0x000009F3}, {0x000009FA, 0x000009FB}, {0x00000AF1, 0x00000AF1},
{0x00000B70, 0x00000B70}, {0x00000BF3, 0x00000BFA}, {0x00000C7F, 0x00000C7F}, {0x00000D4F, 0x00000D4F},
{0x00000D79, 0x00000D79}, {0x00000E3F, 0x00000E3F}, {0x00000F01, 0x00000F03}, {0x00000F13, 0x00000F13},
{0x00000F15, 0x00000F17}, {0x00000F1A, 0x00000F1F}, {0x00000F34, 0x00000F34}, {0x00000F36, 0x00000F36},
{0x00000F38, 0x00000F38}, {0x00000FBE, 0x00000FC5}, {0x00000FC7, 0x00000FCC}, {0x00000FCE, 0x00000FCF},
{0x00000FD5, 0x00000FD8}, {0x0000109E, 0x0000109F}, {0x00001390, 0x00001399}, {0x0000166D, 0x0000166D},
{0x000017DB, 0x000017DB}, {0x00001940, 0x00001940}, {0x000019DE, 0x000019FF}, {0x00001B61, 0x00001B6A},
{0x00001B74, 0x00001B7C}, {0x00001FBD, 0x00001FBD}, {0x00001FBF, 0x00001FC1}, {0x00001FCD, 0x00001FCF},
{0x00001FDD, 0x00001FDF}, {0x00001FED, 0x00001FEF}, {0x00001FFD, 0x00001FFE}, {0x00002044, 0x00002044},
{0x00002052, 0x00002052}, {0x0000207A, 0x0000207C}, {0x0000208A, 0x0000208C}, {0x000020A0, 0x000020BF},
{0x00002100, 0x00002101}, {0x00002103, 0x00002106}, {0x00002108, 0x00002109}, {0x00002114, 0x00002114},
{0x00002116, 0x00002118}, {0x0000211E, 0x00002123}, {0x00002125, 0x00002125}, {0x00002127, 0x00002127},
{0x00002129, 0x00002129}, {0x0000212E, 0x0000212E}, {0x0000213A, 0x0000213B}, {0x00002140, 0x00002144},
{0x0000214A, 0x0000214D}, {0x0000214F, 0x0000214F}, {0x0000218A, 0x0000218B}, {0x00002190, 0x00002307},
{0x0000230C, 0x00002328}, {0x0000232B, 0x00002426}, {0x00002440, 0x0000244A}, {0x0000249C, 0x000024E9},
{0x00002500, 0x00002767}, {0x00002794, 0x000027C4}, {0x000027C7, 0x000027E5}, {0x000027F0, 0x00002982},
{0x00002999, 0x000029D7}, {0x000029DC, 0x000029FB}, {0x000029FE, 0x00002B73}, {0x00002B76, 0x00002B95},
{0x00002B97, 0x00002BFF}, {0x00002CE5, 0x00002CEA}, {0x00002E50, 0x00002E51}, {0x00002E80, 0x00002E99},
{0x00002E9B, 0x00002EF3}, {0x00002F00, 0x00002FD5}, {0x00002FF0, 0x00002FFB}, {0x00003004, 0x00003004},
{0x00003012, 0x00003013}, {0x00003020, 0x00003020}, {0x00003036, 0x00003037}, {0x0000303E, 0x0000303F},
{0x0000309B, 0x0000309C}, {0x00003190, 0x00003191}, {0x00003196, 0x0000319F}, {0x000031C0, 0x000031E3},
{0x00003200, 0x0000321E}, {0x0000322A, 0x00003247}, {0x00003250, 0x00003250}, {0x00003260, 0x0000327F},
{0x0000328A, 0x000032B0}, {0x000032C0, 0x000033FF}, {0x00004DC0, 0x00004DFF}, {0x0000A490, 0x0000A4C6},
{0x0000A700, 0x0000A716}, {0x0000A720, 0x0000A721}, {0x0000A789, 0x0000A78A}, {0x0000A828, 0x0000A82B},
{0x0000A836, 0x0000A839}, {0x0000AA77, 0x0000AA79}, {0x0000AB5B, 0x0000AB5B}, {0x0000AB6A, 0x0000AB6B},
{0x0000FB29, 0x0000FB29}, {0x0000FBB2, 0x0000FBC1}, {0x0000FDFC, 0x0000FDFD}, {0x0000FE62, 0x0000FE62},
{0x0000FE64, 0x0000FE66}, {0x0000FE69, 0x0000FE69}, {0x0000FF04, 0x0000FF04}, {0x0000FF0B, 0x0000FF0B},
{0x0000FF1C, 0x0000FF1E}, {0x0000FF3E, 0x0000FF3E}, {0x0000FF40, 0x0000FF40}, {0x0000FF5C, 0x0000FF5C},
{0x0000FF5E, 0x0000FF5E}, {0x0000FFE0, 0x0000FFE6}, {0x0000FFE8, 0x0000FFEE}, {0x0000FFFC, 0x0000FFFD},
{0x00010137, 0x0001013F}, {0x00010179, 0x00010189}, {0x0001018C, 0x0001018E}, {0x00010190, 0x0001019C},
{0x000101A0, 0x000101A0}, {0x000101D0, 0x000101FC}, {0x00010877, 0x00010878}, {0x00010AC8, 0x00010AC8},
{0x0001173F, 0x0001173F}, {0x00011FD5, 0x00011FF1}, {0x00016B3C, 0x00016B3F}, {0x00016B45, 0x00016B45},
{0x0001BC9C, 0x0001BC9C}, {0x0001D000, 0x0001D0F5}, {0x0001D100, 0x0001D126}, {0x0001D129, 0x0001D164},
{0x0001D16A, 0x0001D16C}, {0x0001D183, 0x0001D184}, {0x0001D18C, 0x0001D1A9}, {0x0001D1AE, 0x0001D1E8},
{0x000007FE, 0x000007FF}, {0x00000888, 0x00000888}, {0x000009F2, 0x000009F3}, {0x000009FA, 0x000009FB},
{0x00000AF1, 0x00000AF1}, {0x00000B70, 0x00000B70}, {0x00000BF3, 0x00000BFA}, {0x00000C7F, 0x00000C7F},
{0x00000D4F, 0x00000D4F}, {0x00000D79, 0x00000D79}, {0x00000E3F, 0x00000E3F}, {0x00000F01, 0x00000F03},
{0x00000F13, 0x00000F13}, {0x00000F15, 0x00000F17}, {0x00000F1A, 0x00000F1F}, {0x00000F34, 0x00000F34},
{0x00000F36, 0x00000F36}, {0x00000F38, 0x00000F38}, {0x00000FBE, 0x00000FC5}, {0x00000FC7, 0x00000FCC},
{0x00000FCE, 0x00000FCF}, {0x00000FD5, 0x00000FD8}, {0x0000109E, 0x0000109F}, {0x00001390, 0x00001399},
{0x0000166D, 0x0000166D}, {0x000017DB, 0x000017DB}, {0x00001940, 0x00001940}, {0x000019DE, 0x000019FF},
{0x00001B61, 0x00001B6A}, {0x00001B74, 0x00001B7C}, {0x00001FBD, 0x00001FBD}, {0x00001FBF, 0x00001FC1},
{0x00001FCD, 0x00001FCF}, {0x00001FDD, 0x00001FDF}, {0x00001FED, 0x00001FEF}, {0x00001FFD, 0x00001FFE},
{0x00002044, 0x00002044}, {0x00002052, 0x00002052}, {0x0000207A, 0x0000207C}, {0x0000208A, 0x0000208C},
{0x000020A0, 0x000020C0}, {0x00002100, 0x00002101}, {0x00002103, 0x00002106}, {0x00002108, 0x00002109},
{0x00002114, 0x00002114}, {0x00002116, 0x00002118}, {0x0000211E, 0x00002123}, {0x00002125, 0x00002125},
{0x00002127, 0x00002127}, {0x00002129, 0x00002129}, {0x0000212E, 0x0000212E}, {0x0000213A, 0x0000213B},
{0x00002140, 0x00002144}, {0x0000214A, 0x0000214D}, {0x0000214F, 0x0000214F}, {0x0000218A, 0x0000218B},
{0x00002190, 0x00002307}, {0x0000230C, 0x00002328}, {0x0000232B, 0x00002426}, {0x00002440, 0x0000244A},
{0x0000249C, 0x000024E9}, {0x00002500, 0x00002767}, {0x00002794, 0x000027C4}, {0x000027C7, 0x000027E5},
{0x000027F0, 0x00002982}, {0x00002999, 0x000029D7}, {0x000029DC, 0x000029FB}, {0x000029FE, 0x00002B73},
{0x00002B76, 0x00002B95}, {0x00002B97, 0x00002BFF}, {0x00002CE5, 0x00002CEA}, {0x00002E50, 0x00002E51},
{0x00002E80, 0x00002E99}, {0x00002E9B, 0x00002EF3}, {0x00002F00, 0x00002FD5}, {0x00002FF0, 0x00002FFB},
{0x00003004, 0x00003004}, {0x00003012, 0x00003013}, {0x00003020, 0x00003020}, {0x00003036, 0x00003037},
{0x0000303E, 0x0000303F}, {0x0000309B, 0x0000309C}, {0x00003190, 0x00003191}, {0x00003196, 0x0000319F},
{0x000031C0, 0x000031E3}, {0x00003200, 0x0000321E}, {0x0000322A, 0x00003247}, {0x00003250, 0x00003250},
{0x00003260, 0x0000327F}, {0x0000328A, 0x000032B0}, {0x000032C0, 0x000033FF}, {0x00004DC0, 0x00004DFF},
{0x0000A490, 0x0000A4C6}, {0x0000A700, 0x0000A716}, {0x0000A720, 0x0000A721}, {0x0000A789, 0x0000A78A},
{0x0000A828, 0x0000A82B}, {0x0000A836, 0x0000A839}, {0x0000AA77, 0x0000AA79}, {0x0000AB5B, 0x0000AB5B},
{0x0000AB6A, 0x0000AB6B}, {0x0000FB29, 0x0000FB29}, {0x0000FBB2, 0x0000FBC2}, {0x0000FD40, 0x0000FD4F},
{0x0000FDCF, 0x0000FDCF}, {0x0000FDFC, 0x0000FDFF}, {0x0000FE62, 0x0000FE62}, {0x0000FE64, 0x0000FE66},
{0x0000FE69, 0x0000FE69}, {0x0000FF04, 0x0000FF04}, {0x0000FF0B, 0x0000FF0B}, {0x0000FF1C, 0x0000FF1E},
{0x0000FF3E, 0x0000FF3E}, {0x0000FF40, 0x0000FF40}, {0x0000FF5C, 0x0000FF5C}, {0x0000FF5E, 0x0000FF5E},
{0x0000FFE0, 0x0000FFE6}, {0x0000FFE8, 0x0000FFEE}, {0x0000FFFC, 0x0000FFFD}, {0x00010137, 0x0001013F},
{0x00010179, 0x00010189}, {0x0001018C, 0x0001018E}, {0x00010190, 0x0001019C}, {0x000101A0, 0x000101A0},
{0x000101D0, 0x000101FC}, {0x00010877, 0x00010878}, {0x00010AC8, 0x00010AC8}, {0x0001173F, 0x0001173F},
{0x00011FD5, 0x00011FF1}, {0x00016B3C, 0x00016B3F}, {0x00016B45, 0x00016B45}, {0x0001BC9C, 0x0001BC9C},
{0x0001CF50, 0x0001CFC3}, {0x0001D000, 0x0001D0F5}, {0x0001D100, 0x0001D126}, {0x0001D129, 0x0001D164},
{0x0001D16A, 0x0001D16C}, {0x0001D183, 0x0001D184}, {0x0001D18C, 0x0001D1A9}, {0x0001D1AE, 0x0001D1EA},
{0x0001D200, 0x0001D241}, {0x0001D245, 0x0001D245}, {0x0001D300, 0x0001D356}, {0x0001D6C1, 0x0001D6C1},
{0x0001D6DB, 0x0001D6DB}, {0x0001D6FB, 0x0001D6FB}, {0x0001D715, 0x0001D715}, {0x0001D735, 0x0001D735},
{0x0001D74F, 0x0001D74F}, {0x0001D76F, 0x0001D76F}, {0x0001D789, 0x0001D789}, {0x0001D7A9, 0x0001D7A9},
@ -371,102 +406,100 @@ const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_symbol = {
{0x0001F000, 0x0001F02B}, {0x0001F030, 0x0001F093}, {0x0001F0A0, 0x0001F0AE}, {0x0001F0B1, 0x0001F0BF},
{0x0001F0C1, 0x0001F0CF}, {0x0001F0D1, 0x0001F0F5}, {0x0001F10D, 0x0001F1AD}, {0x0001F1E6, 0x0001F202},
{0x0001F210, 0x0001F23B}, {0x0001F240, 0x0001F248}, {0x0001F250, 0x0001F251}, {0x0001F260, 0x0001F265},
{0x0001F300, 0x0001F6D7}, {0x0001F6E0, 0x0001F6EC}, {0x0001F6F0, 0x0001F6FC}, {0x0001F700, 0x0001F773},
{0x0001F780, 0x0001F7D8}, {0x0001F7E0, 0x0001F7EB}, {0x0001F800, 0x0001F80B}, {0x0001F810, 0x0001F847},
{0x0001F850, 0x0001F859}, {0x0001F860, 0x0001F887}, {0x0001F890, 0x0001F8AD}, {0x0001F8B0, 0x0001F8B1},
{0x0001F900, 0x0001F978}, {0x0001F97A, 0x0001F9CB}, {0x0001F9CD, 0x0001FA53}, {0x0001FA60, 0x0001FA6D},
{0x0001FA70, 0x0001FA74}, {0x0001FA78, 0x0001FA7A}, {0x0001FA80, 0x0001FA86}, {0x0001FA90, 0x0001FAA8},
{0x0001FAB0, 0x0001FAB6}, {0x0001FAC0, 0x0001FAC2}, {0x0001FAD0, 0x0001FAD6}, {0x0001FB00, 0x0001FB92},
{0x0001FB94, 0x0001FBCA},
{0x0001F300, 0x0001F6D7}, {0x0001F6DC, 0x0001F6EC}, {0x0001F6F0, 0x0001F6FC}, {0x0001F700, 0x0001F776},
{0x0001F77B, 0x0001F7D9}, {0x0001F7E0, 0x0001F7EB}, {0x0001F7F0, 0x0001F7F0}, {0x0001F800, 0x0001F80B},
{0x0001F810, 0x0001F847}, {0x0001F850, 0x0001F859}, {0x0001F860, 0x0001F887}, {0x0001F890, 0x0001F8AD},
{0x0001F8B0, 0x0001F8B1}, {0x0001F900, 0x0001FA53}, {0x0001FA60, 0x0001FA6D}, {0x0001FA70, 0x0001FA7C},
{0x0001FA80, 0x0001FA88}, {0x0001FA90, 0x0001FABD}, {0x0001FABF, 0x0001FAC5}, {0x0001FACE, 0x0001FADB},
{0x0001FAE0, 0x0001FAE8}, {0x0001FAF0, 0x0001FAF8}, {0x0001FB00, 0x0001FB92}, {0x0001FB94, 0x0001FBCA},
};
const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_control = {
{0x00000000, 0x00000008}, {0x0000000E, 0x0000001B}, {0x0000007F, 0x00000084}, {0x00000086, 0x0000009F},
{0x000000AD, 0x000000AD}, {0x00000378, 0x00000379}, {0x00000380, 0x00000383}, {0x0000038B, 0x0000038B},
{0x0000038D, 0x0000038D}, {0x000003A2, 0x000003A2}, {0x00000530, 0x00000530}, {0x00000557, 0x00000558},
{0x0000058B, 0x0000058C}, {0x00000590, 0x00000590}, {0x000005C8, 0x000005CF}, {0x000005EB, 0x000005EE},
{0x000005F5, 0x00000605}, {0x0000061C, 0x0000061D}, {0x000006DD, 0x000006DD}, {0x0000070E, 0x0000070F},
{0x0000074B, 0x0000074C}, {0x000007B2, 0x000007BF}, {0x000007FB, 0x000007FC}, {0x0000082E, 0x0000082F},
{0x0000083F, 0x0000083F}, {0x0000085C, 0x0000085D}, {0x0000085F, 0x0000085F}, {0x0000086B, 0x0000089F},
{0x000008B5, 0x000008B5}, {0x000008C8, 0x000008D2}, {0x000008E2, 0x000008E2}, {0x00000984, 0x00000984},
{0x0000098D, 0x0000098E}, {0x00000991, 0x00000992}, {0x000009A9, 0x000009A9}, {0x000009B1, 0x000009B1},
{0x000009B3, 0x000009B5}, {0x000009BA, 0x000009BB}, {0x000009C5, 0x000009C6}, {0x000009C9, 0x000009CA},
{0x000009CF, 0x000009D6}, {0x000009D8, 0x000009DB}, {0x000009DE, 0x000009DE}, {0x000009E4, 0x000009E5},
{0x000009FF, 0x00000A00}, {0x00000A04, 0x00000A04}, {0x00000A0B, 0x00000A0E}, {0x00000A11, 0x00000A12},
{0x00000A29, 0x00000A29}, {0x00000A31, 0x00000A31}, {0x00000A34, 0x00000A34}, {0x00000A37, 0x00000A37},
{0x00000A3A, 0x00000A3B}, {0x00000A3D, 0x00000A3D}, {0x00000A43, 0x00000A46}, {0x00000A49, 0x00000A4A},
{0x00000A4E, 0x00000A50}, {0x00000A52, 0x00000A58}, {0x00000A5D, 0x00000A5D}, {0x00000A5F, 0x00000A65},
{0x00000A77, 0x00000A80}, {0x00000A84, 0x00000A84}, {0x00000A8E, 0x00000A8E}, {0x00000A92, 0x00000A92},
{0x00000AA9, 0x00000AA9}, {0x00000AB1, 0x00000AB1}, {0x00000AB4, 0x00000AB4}, {0x00000ABA, 0x00000ABB},
{0x00000AC6, 0x00000AC6}, {0x00000ACA, 0x00000ACA}, {0x00000ACE, 0x00000ACF}, {0x00000AD1, 0x00000ADF},
{0x00000AE4, 0x00000AE5}, {0x00000AF2, 0x00000AF8}, {0x00000B00, 0x00000B00}, {0x00000B04, 0x00000B04},
{0x00000B0D, 0x00000B0E}, {0x00000B11, 0x00000B12}, {0x00000B29, 0x00000B29}, {0x00000B31, 0x00000B31},
{0x00000B34, 0x00000B34}, {0x00000B3A, 0x00000B3B}, {0x00000B45, 0x00000B46}, {0x00000B49, 0x00000B4A},
{0x00000B4E, 0x00000B54}, {0x00000B58, 0x00000B5B}, {0x00000B5E, 0x00000B5E}, {0x00000B64, 0x00000B65},
{0x00000B78, 0x00000B81}, {0x00000B84, 0x00000B84}, {0x00000B8B, 0x00000B8D}, {0x00000B91, 0x00000B91},
{0x00000B96, 0x00000B98}, {0x00000B9B, 0x00000B9B}, {0x00000B9D, 0x00000B9D}, {0x00000BA0, 0x00000BA2},
{0x00000BA5, 0x00000BA7}, {0x00000BAB, 0x00000BAD}, {0x00000BBA, 0x00000BBD}, {0x00000BC3, 0x00000BC5},
{0x00000BC9, 0x00000BC9}, {0x00000BCE, 0x00000BCF}, {0x00000BD1, 0x00000BD6}, {0x00000BD8, 0x00000BE5},
{0x00000BFB, 0x00000BFF}, {0x00000C0D, 0x00000C0D}, {0x00000C11, 0x00000C11}, {0x00000C29, 0x00000C29},
{0x00000C3A, 0x00000C3C}, {0x00000C45, 0x00000C45}, {0x00000C49, 0x00000C49}, {0x00000C4E, 0x00000C54},
{0x00000C57, 0x00000C57}, {0x00000C5B, 0x00000C5F}, {0x00000C64, 0x00000C65}, {0x00000C70, 0x00000C76},
{0x00000C8D, 0x00000C8D}, {0x00000C91, 0x00000C91}, {0x00000CA9, 0x00000CA9}, {0x00000CB4, 0x00000CB4},
{0x00000CBA, 0x00000CBB}, {0x00000CC5, 0x00000CC5}, {0x00000CC9, 0x00000CC9}, {0x00000CCE, 0x00000CD4},
{0x00000CD7, 0x00000CDD}, {0x00000CDF, 0x00000CDF}, {0x00000CE4, 0x00000CE5}, {0x00000CF0, 0x00000CF0},
{0x00000CF3, 0x00000CFF}, {0x00000D0D, 0x00000D0D}, {0x00000D11, 0x00000D11}, {0x00000D45, 0x00000D45},
{0x00000D49, 0x00000D49}, {0x00000D50, 0x00000D53}, {0x00000D64, 0x00000D65}, {0x00000D80, 0x00000D80},
{0x00000D84, 0x00000D84}, {0x00000D97, 0x00000D99}, {0x00000DB2, 0x00000DB2}, {0x00000DBC, 0x00000DBC},
{0x00000DBE, 0x00000DBF}, {0x00000DC7, 0x00000DC9}, {0x00000DCB, 0x00000DCE}, {0x00000DD5, 0x00000DD5},
{0x00000DD7, 0x00000DD7}, {0x00000DE0, 0x00000DE5}, {0x00000DF0, 0x00000DF1}, {0x00000DF5, 0x00000E00},
{0x00000E3B, 0x00000E3E}, {0x00000E5C, 0x00000E80}, {0x00000E83, 0x00000E83}, {0x00000E85, 0x00000E85},
{0x00000E8B, 0x00000E8B}, {0x00000EA4, 0x00000EA4}, {0x00000EA6, 0x00000EA6}, {0x00000EBE, 0x00000EBF},
{0x00000EC5, 0x00000EC5}, {0x00000EC7, 0x00000EC7}, {0x00000ECE, 0x00000ECF}, {0x00000EDA, 0x00000EDB},
{0x00000EE0, 0x00000EFF}, {0x00000F48, 0x00000F48}, {0x00000F6D, 0x00000F70}, {0x00000F98, 0x00000F98},
{0x00000FBD, 0x00000FBD}, {0x00000FCD, 0x00000FCD}, {0x00000FDB, 0x00000FFF}, {0x000010C6, 0x000010C6},
{0x000010C8, 0x000010CC}, {0x000010CE, 0x000010CF}, {0x00001249, 0x00001249}, {0x0000124E, 0x0000124F},
{0x00001257, 0x00001257}, {0x00001259, 0x00001259}, {0x0000125E, 0x0000125F}, {0x00001289, 0x00001289},
{0x0000128E, 0x0000128F}, {0x000012B1, 0x000012B1}, {0x000012B6, 0x000012B7}, {0x000012BF, 0x000012BF},
{0x000012C1, 0x000012C1}, {0x000012C6, 0x000012C7}, {0x000012D7, 0x000012D7}, {0x00001311, 0x00001311},
{0x00001316, 0x00001317}, {0x0000135B, 0x0000135C}, {0x0000137D, 0x0000137F}, {0x0000139A, 0x0000139F},
{0x000013F6, 0x000013F7}, {0x000013FE, 0x000013FF}, {0x0000169D, 0x0000169F}, {0x000016F9, 0x000016FF},
{0x0000170D, 0x0000170D}, {0x00001715, 0x0000171F}, {0x00001737, 0x0000173F}, {0x00001754, 0x0000175F},
{0x0000176D, 0x0000176D}, {0x00001771, 0x00001771}, {0x00001774, 0x0000177F}, {0x000017DE, 0x000017DF},
{0x000017EA, 0x000017EF}, {0x000017FA, 0x000017FF}, {0x0000180E, 0x0000180F}, {0x0000181A, 0x0000181F},
{0x00001879, 0x0000187F}, {0x000018AB, 0x000018AF}, {0x000018F6, 0x000018FF}, {0x0000191F, 0x0000191F},
{0x0000192C, 0x0000192F}, {0x0000193C, 0x0000193F}, {0x00001941, 0x00001943}, {0x0000196E, 0x0000196F},
{0x00001975, 0x0000197F}, {0x000019AC, 0x000019AF}, {0x000019CA, 0x000019CF}, {0x000019DB, 0x000019DD},
{0x00001A1C, 0x00001A1D}, {0x00001A5F, 0x00001A5F}, {0x00001A7D, 0x00001A7E}, {0x00001A8A, 0x00001A8F},
{0x00001A9A, 0x00001A9F}, {0x00001AAE, 0x00001AAF}, {0x00001AC1, 0x00001AFF}, {0x00001B4C, 0x00001B4F},
{0x00001B7D, 0x00001B7F}, {0x00001BF4, 0x00001BFB}, {0x00001C38, 0x00001C3A}, {0x00001C4A, 0x00001C4C},
{0x00001C89, 0x00001C8F}, {0x00001CBB, 0x00001CBC}, {0x00001CC8, 0x00001CCF}, {0x00001CFB, 0x00001CFF},
{0x00001DFA, 0x00001DFA}, {0x00001F16, 0x00001F17}, {0x00001F1E, 0x00001F1F}, {0x00001F46, 0x00001F47},
{0x00000000, 0x0000001F}, {0x0000007F, 0x0000009F}, {0x000000AD, 0x000000AD}, {0x00000378, 0x00000379},
{0x00000380, 0x00000383}, {0x0000038B, 0x0000038B}, {0x0000038D, 0x0000038D}, {0x000003A2, 0x000003A2},
{0x00000530, 0x00000530}, {0x00000557, 0x00000558}, {0x0000058B, 0x0000058C}, {0x00000590, 0x00000590},
{0x000005C8, 0x000005CF}, {0x000005EB, 0x000005EE}, {0x000005F5, 0x00000605}, {0x0000061C, 0x0000061C},
{0x000006DD, 0x000006DD}, {0x0000070E, 0x0000070F}, {0x0000074B, 0x0000074C}, {0x000007B2, 0x000007BF},
{0x000007FB, 0x000007FC}, {0x0000082E, 0x0000082F}, {0x0000083F, 0x0000083F}, {0x0000085C, 0x0000085D},
{0x0000085F, 0x0000085F}, {0x0000086B, 0x0000086F}, {0x0000088F, 0x00000897}, {0x000008E2, 0x000008E2},
{0x00000984, 0x00000984}, {0x0000098D, 0x0000098E}, {0x00000991, 0x00000992}, {0x000009A9, 0x000009A9},
{0x000009B1, 0x000009B1}, {0x000009B3, 0x000009B5}, {0x000009BA, 0x000009BB}, {0x000009C5, 0x000009C6},
{0x000009C9, 0x000009CA}, {0x000009CF, 0x000009D6}, {0x000009D8, 0x000009DB}, {0x000009DE, 0x000009DE},
{0x000009E4, 0x000009E5}, {0x000009FF, 0x00000A00}, {0x00000A04, 0x00000A04}, {0x00000A0B, 0x00000A0E},
{0x00000A11, 0x00000A12}, {0x00000A29, 0x00000A29}, {0x00000A31, 0x00000A31}, {0x00000A34, 0x00000A34},
{0x00000A37, 0x00000A37}, {0x00000A3A, 0x00000A3B}, {0x00000A3D, 0x00000A3D}, {0x00000A43, 0x00000A46},
{0x00000A49, 0x00000A4A}, {0x00000A4E, 0x00000A50}, {0x00000A52, 0x00000A58}, {0x00000A5D, 0x00000A5D},
{0x00000A5F, 0x00000A65}, {0x00000A77, 0x00000A80}, {0x00000A84, 0x00000A84}, {0x00000A8E, 0x00000A8E},
{0x00000A92, 0x00000A92}, {0x00000AA9, 0x00000AA9}, {0x00000AB1, 0x00000AB1}, {0x00000AB4, 0x00000AB4},
{0x00000ABA, 0x00000ABB}, {0x00000AC6, 0x00000AC6}, {0x00000ACA, 0x00000ACA}, {0x00000ACE, 0x00000ACF},
{0x00000AD1, 0x00000ADF}, {0x00000AE4, 0x00000AE5}, {0x00000AF2, 0x00000AF8}, {0x00000B00, 0x00000B00},
{0x00000B04, 0x00000B04}, {0x00000B0D, 0x00000B0E}, {0x00000B11, 0x00000B12}, {0x00000B29, 0x00000B29},
{0x00000B31, 0x00000B31}, {0x00000B34, 0x00000B34}, {0x00000B3A, 0x00000B3B}, {0x00000B45, 0x00000B46},
{0x00000B49, 0x00000B4A}, {0x00000B4E, 0x00000B54}, {0x00000B58, 0x00000B5B}, {0x00000B5E, 0x00000B5E},
{0x00000B64, 0x00000B65}, {0x00000B78, 0x00000B81}, {0x00000B84, 0x00000B84}, {0x00000B8B, 0x00000B8D},
{0x00000B91, 0x00000B91}, {0x00000B96, 0x00000B98}, {0x00000B9B, 0x00000B9B}, {0x00000B9D, 0x00000B9D},
{0x00000BA0, 0x00000BA2}, {0x00000BA5, 0x00000BA7}, {0x00000BAB, 0x00000BAD}, {0x00000BBA, 0x00000BBD},
{0x00000BC3, 0x00000BC5}, {0x00000BC9, 0x00000BC9}, {0x00000BCE, 0x00000BCF}, {0x00000BD1, 0x00000BD6},
{0x00000BD8, 0x00000BE5}, {0x00000BFB, 0x00000BFF}, {0x00000C0D, 0x00000C0D}, {0x00000C11, 0x00000C11},
{0x00000C29, 0x00000C29}, {0x00000C3A, 0x00000C3B}, {0x00000C45, 0x00000C45}, {0x00000C49, 0x00000C49},
{0x00000C4E, 0x00000C54}, {0x00000C57, 0x00000C57}, {0x00000C5B, 0x00000C5C}, {0x00000C5E, 0x00000C5F},
{0x00000C64, 0x00000C65}, {0x00000C70, 0x00000C76}, {0x00000C8D, 0x00000C8D}, {0x00000C91, 0x00000C91},
{0x00000CA9, 0x00000CA9}, {0x00000CB4, 0x00000CB4}, {0x00000CBA, 0x00000CBB}, {0x00000CC5, 0x00000CC5},
{0x00000CC9, 0x00000CC9}, {0x00000CCE, 0x00000CD4}, {0x00000CD7, 0x00000CDC}, {0x00000CDF, 0x00000CDF},
{0x00000CE4, 0x00000CE5}, {0x00000CF0, 0x00000CF0}, {0x00000CF4, 0x00000CFF}, {0x00000D0D, 0x00000D0D},
{0x00000D11, 0x00000D11}, {0x00000D45, 0x00000D45}, {0x00000D49, 0x00000D49}, {0x00000D50, 0x00000D53},
{0x00000D64, 0x00000D65}, {0x00000D80, 0x00000D80}, {0x00000D84, 0x00000D84}, {0x00000D97, 0x00000D99},
{0x00000DB2, 0x00000DB2}, {0x00000DBC, 0x00000DBC}, {0x00000DBE, 0x00000DBF}, {0x00000DC7, 0x00000DC9},
{0x00000DCB, 0x00000DCE}, {0x00000DD5, 0x00000DD5}, {0x00000DD7, 0x00000DD7}, {0x00000DE0, 0x00000DE5},
{0x00000DF0, 0x00000DF1}, {0x00000DF5, 0x00000E00}, {0x00000E3B, 0x00000E3E}, {0x00000E5C, 0x00000E80},
{0x00000E83, 0x00000E83}, {0x00000E85, 0x00000E85}, {0x00000E8B, 0x00000E8B}, {0x00000EA4, 0x00000EA4},
{0x00000EA6, 0x00000EA6}, {0x00000EBE, 0x00000EBF}, {0x00000EC5, 0x00000EC5}, {0x00000EC7, 0x00000EC7},
{0x00000ECF, 0x00000ECF}, {0x00000EDA, 0x00000EDB}, {0x00000EE0, 0x00000EFF}, {0x00000F48, 0x00000F48},
{0x00000F6D, 0x00000F70}, {0x00000F98, 0x00000F98}, {0x00000FBD, 0x00000FBD}, {0x00000FCD, 0x00000FCD},
{0x00000FDB, 0x00000FFF}, {0x000010C6, 0x000010C6}, {0x000010C8, 0x000010CC}, {0x000010CE, 0x000010CF},
{0x00001249, 0x00001249}, {0x0000124E, 0x0000124F}, {0x00001257, 0x00001257}, {0x00001259, 0x00001259},
{0x0000125E, 0x0000125F}, {0x00001289, 0x00001289}, {0x0000128E, 0x0000128F}, {0x000012B1, 0x000012B1},
{0x000012B6, 0x000012B7}, {0x000012BF, 0x000012BF}, {0x000012C1, 0x000012C1}, {0x000012C6, 0x000012C7},
{0x000012D7, 0x000012D7}, {0x00001311, 0x00001311}, {0x00001316, 0x00001317}, {0x0000135B, 0x0000135C},
{0x0000137D, 0x0000137F}, {0x0000139A, 0x0000139F}, {0x000013F6, 0x000013F7}, {0x000013FE, 0x000013FF},
{0x0000169D, 0x0000169F}, {0x000016F9, 0x000016FF}, {0x00001716, 0x0000171E}, {0x00001737, 0x0000173F},
{0x00001754, 0x0000175F}, {0x0000176D, 0x0000176D}, {0x00001771, 0x00001771}, {0x00001774, 0x0000177F},
{0x000017DE, 0x000017DF}, {0x000017EA, 0x000017EF}, {0x000017FA, 0x000017FF}, {0x0000180E, 0x0000180E},
{0x0000181A, 0x0000181F}, {0x00001879, 0x0000187F}, {0x000018AB, 0x000018AF}, {0x000018F6, 0x000018FF},
{0x0000191F, 0x0000191F}, {0x0000192C, 0x0000192F}, {0x0000193C, 0x0000193F}, {0x00001941, 0x00001943},
{0x0000196E, 0x0000196F}, {0x00001975, 0x0000197F}, {0x000019AC, 0x000019AF}, {0x000019CA, 0x000019CF},
{0x000019DB, 0x000019DD}, {0x00001A1C, 0x00001A1D}, {0x00001A5F, 0x00001A5F}, {0x00001A7D, 0x00001A7E},
{0x00001A8A, 0x00001A8F}, {0x00001A9A, 0x00001A9F}, {0x00001AAE, 0x00001AAF}, {0x00001ACF, 0x00001AFF},
{0x00001B4D, 0x00001B4F}, {0x00001B7F, 0x00001B7F}, {0x00001BF4, 0x00001BFB}, {0x00001C38, 0x00001C3A},
{0x00001C4A, 0x00001C4C}, {0x00001C89, 0x00001C8F}, {0x00001CBB, 0x00001CBC}, {0x00001CC8, 0x00001CCF},
{0x00001CFB, 0x00001CFF}, {0x00001F16, 0x00001F17}, {0x00001F1E, 0x00001F1F}, {0x00001F46, 0x00001F47},
{0x00001F4E, 0x00001F4F}, {0x00001F58, 0x00001F58}, {0x00001F5A, 0x00001F5A}, {0x00001F5C, 0x00001F5C},
{0x00001F5E, 0x00001F5E}, {0x00001F7E, 0x00001F7F}, {0x00001FB5, 0x00001FB5}, {0x00001FC5, 0x00001FC5},
{0x00001FD4, 0x00001FD5}, {0x00001FDC, 0x00001FDC}, {0x00001FF0, 0x00001FF1}, {0x00001FF5, 0x00001FF5},
{0x00001FFF, 0x00001FFF}, {0x0000200B, 0x0000200F}, {0x0000202A, 0x0000202E}, {0x00002060, 0x0000206F},
{0x00002072, 0x00002073}, {0x0000208F, 0x0000208F}, {0x0000209D, 0x0000209F}, {0x000020C0, 0x000020CF},
{0x00002072, 0x00002073}, {0x0000208F, 0x0000208F}, {0x0000209D, 0x0000209F}, {0x000020C1, 0x000020CF},
{0x000020F1, 0x000020FF}, {0x0000218C, 0x0000218F}, {0x00002427, 0x0000243F}, {0x0000244B, 0x0000245F},
{0x00002B74, 0x00002B75}, {0x00002B96, 0x00002B96}, {0x00002C2F, 0x00002C2F}, {0x00002C5F, 0x00002C5F},
{0x00002CF4, 0x00002CF8}, {0x00002D26, 0x00002D26}, {0x00002D28, 0x00002D2C}, {0x00002D2E, 0x00002D2F},
{0x00002D68, 0x00002D6E}, {0x00002D71, 0x00002D7E}, {0x00002D97, 0x00002D9F}, {0x00002DA7, 0x00002DA7},
{0x00002DAF, 0x00002DAF}, {0x00002DB7, 0x00002DB7}, {0x00002DBF, 0x00002DBF}, {0x00002DC7, 0x00002DC7},
{0x00002DCF, 0x00002DCF}, {0x00002DD7, 0x00002DD7}, {0x00002DDF, 0x00002DDF}, {0x00002E53, 0x00002E7F},
{0x00002E9A, 0x00002E9A}, {0x00002EF4, 0x00002EFF}, {0x00002FD6, 0x00002FEF}, {0x00002FFC, 0x00002FFF},
{0x00003040, 0x00003040}, {0x00003097, 0x00003098}, {0x00003100, 0x00003104}, {0x00003130, 0x00003130},
{0x0000318F, 0x0000318F}, {0x000031E4, 0x000031EF}, {0x0000321F, 0x0000321F}, {0x00009FFD, 0x00009FFF},
{0x0000A48D, 0x0000A48F}, {0x0000A4C7, 0x0000A4CF}, {0x0000A62C, 0x0000A63F}, {0x0000A6F8, 0x0000A6FF},
{0x0000A7C0, 0x0000A7C1}, {0x0000A7CB, 0x0000A7F4}, {0x0000A82D, 0x0000A82F}, {0x0000A83A, 0x0000A83F},
{0x0000A878, 0x0000A87F}, {0x0000A8C6, 0x0000A8CD}, {0x0000A8DA, 0x0000A8DF}, {0x0000A954, 0x0000A95E},
{0x0000A97D, 0x0000A97F}, {0x0000A9CE, 0x0000A9CE}, {0x0000A9DA, 0x0000A9DD}, {0x0000A9FF, 0x0000A9FF},
{0x0000AA37, 0x0000AA3F}, {0x0000AA4E, 0x0000AA4F}, {0x0000AA5A, 0x0000AA5B}, {0x0000AAC3, 0x0000AADA},
{0x0000AAF7, 0x0000AB00}, {0x0000AB07, 0x0000AB08}, {0x0000AB0F, 0x0000AB10}, {0x0000AB17, 0x0000AB1F},
{0x0000AB27, 0x0000AB27}, {0x0000AB2F, 0x0000AB2F}, {0x0000AB6C, 0x0000AB6F}, {0x0000ABEE, 0x0000ABEF},
{0x0000ABFA, 0x0000ABFF}, {0x0000D7A4, 0x0000D7AF}, {0x0000D7C7, 0x0000D7CA}, {0x0000D7FC, 0x0000F8FF},
{0x00002B74, 0x00002B75}, {0x00002B96, 0x00002B96}, {0x00002CF4, 0x00002CF8}, {0x00002D26, 0x00002D26},
{0x00002D28, 0x00002D2C}, {0x00002D2E, 0x00002D2F}, {0x00002D68, 0x00002D6E}, {0x00002D71, 0x00002D7E},
{0x00002D97, 0x00002D9F}, {0x00002DA7, 0x00002DA7}, {0x00002DAF, 0x00002DAF}, {0x00002DB7, 0x00002DB7},
{0x00002DBF, 0x00002DBF}, {0x00002DC7, 0x00002DC7}, {0x00002DCF, 0x00002DCF}, {0x00002DD7, 0x00002DD7},
{0x00002DDF, 0x00002DDF}, {0x00002E5E, 0x00002E7F}, {0x00002E9A, 0x00002E9A}, {0x00002EF4, 0x00002EFF},
{0x00002FD6, 0x00002FEF}, {0x00002FFC, 0x00002FFF}, {0x00003040, 0x00003040}, {0x00003097, 0x00003098},
{0x00003100, 0x00003104}, {0x00003130, 0x00003130}, {0x0000318F, 0x0000318F}, {0x000031E4, 0x000031EF},
{0x0000321F, 0x0000321F}, {0x0000A48D, 0x0000A48F}, {0x0000A4C7, 0x0000A4CF}, {0x0000A62C, 0x0000A63F},
{0x0000A6F8, 0x0000A6FF}, {0x0000A7CB, 0x0000A7CF}, {0x0000A7D2, 0x0000A7D2}, {0x0000A7D4, 0x0000A7D4},
{0x0000A7DA, 0x0000A7F1}, {0x0000A82D, 0x0000A82F}, {0x0000A83A, 0x0000A83F}, {0x0000A878, 0x0000A87F},
{0x0000A8C6, 0x0000A8CD}, {0x0000A8DA, 0x0000A8DF}, {0x0000A954, 0x0000A95E}, {0x0000A97D, 0x0000A97F},
{0x0000A9CE, 0x0000A9CE}, {0x0000A9DA, 0x0000A9DD}, {0x0000A9FF, 0x0000A9FF}, {0x0000AA37, 0x0000AA3F},
{0x0000AA4E, 0x0000AA4F}, {0x0000AA5A, 0x0000AA5B}, {0x0000AAC3, 0x0000AADA}, {0x0000AAF7, 0x0000AB00},
{0x0000AB07, 0x0000AB08}, {0x0000AB0F, 0x0000AB10}, {0x0000AB17, 0x0000AB1F}, {0x0000AB27, 0x0000AB27},
{0x0000AB2F, 0x0000AB2F}, {0x0000AB6C, 0x0000AB6F}, {0x0000ABEE, 0x0000ABEF}, {0x0000ABFA, 0x0000ABFF},
{0x0000D7A4, 0x0000D7AF}, {0x0000D7C7, 0x0000D7CA}, {0x0000D7FC, 0x0000D7FF}, {0x0000E000, 0x0000F8FF},
{0x0000FA6E, 0x0000FA6F}, {0x0000FADA, 0x0000FAFF}, {0x0000FB07, 0x0000FB12}, {0x0000FB18, 0x0000FB1C},
{0x0000FB37, 0x0000FB37}, {0x0000FB3D, 0x0000FB3D}, {0x0000FB3F, 0x0000FB3F}, {0x0000FB42, 0x0000FB42},
{0x0000FB45, 0x0000FB45}, {0x0000FBC2, 0x0000FBD2}, {0x0000FD40, 0x0000FD4F}, {0x0000FD90, 0x0000FD91},
{0x0000FDC8, 0x0000FDEF}, {0x0000FDFE, 0x0000FDFF}, {0x0000FE1A, 0x0000FE1F}, {0x0000FE53, 0x0000FE53},
{0x0000FE67, 0x0000FE67}, {0x0000FE6C, 0x0000FE6F}, {0x0000FE75, 0x0000FE75}, {0x0000FEFD, 0x0000FF00},
{0x0000FB45, 0x0000FB45}, {0x0000FBC3, 0x0000FBD2}, {0x0000FD90, 0x0000FD91}, {0x0000FDC8, 0x0000FDCE},
{0x0000FDD0, 0x0000FDEF}, {0x0000FE1A, 0x0000FE1F}, {0x0000FE53, 0x0000FE53}, {0x0000FE67, 0x0000FE67},
{0x0000FE6C, 0x0000FE6F}, {0x0000FE75, 0x0000FE75}, {0x0000FEFD, 0x0000FEFE}, {0x0000FF00, 0x0000FF00},
{0x0000FFBF, 0x0000FFC1}, {0x0000FFC8, 0x0000FFC9}, {0x0000FFD0, 0x0000FFD1}, {0x0000FFD8, 0x0000FFD9},
{0x0000FFDD, 0x0000FFDF}, {0x0000FFE7, 0x0000FFE7}, {0x0000FFEF, 0x0000FFFB}, {0x0000FFFE, 0x0000FFFF},
{0x0001000C, 0x0001000C}, {0x00010027, 0x00010027}, {0x0001003B, 0x0001003B}, {0x0001003E, 0x0001003E},
@ -476,82 +509,91 @@ const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_control = {
{0x00010324, 0x0001032C}, {0x0001034B, 0x0001034F}, {0x0001037B, 0x0001037F}, {0x0001039E, 0x0001039E},
{0x000103C4, 0x000103C7}, {0x000103D6, 0x000103FF}, {0x0001049E, 0x0001049F}, {0x000104AA, 0x000104AF},
{0x000104D4, 0x000104D7}, {0x000104FC, 0x000104FF}, {0x00010528, 0x0001052F}, {0x00010564, 0x0001056E},
{0x00010570, 0x000105FF}, {0x00010737, 0x0001073F}, {0x00010756, 0x0001075F}, {0x00010768, 0x000107FF},
{0x00010806, 0x00010807}, {0x00010809, 0x00010809}, {0x00010836, 0x00010836}, {0x00010839, 0x0001083B},
{0x0001083D, 0x0001083E}, {0x00010856, 0x00010856}, {0x0001089F, 0x000108A6}, {0x000108B0, 0x000108DF},
{0x000108F3, 0x000108F3}, {0x000108F6, 0x000108FA}, {0x0001091C, 0x0001091E}, {0x0001093A, 0x0001093E},
{0x00010940, 0x0001097F}, {0x000109B8, 0x000109BB}, {0x000109D0, 0x000109D1}, {0x00010A04, 0x00010A04},
{0x00010A07, 0x00010A0B}, {0x00010A14, 0x00010A14}, {0x00010A18, 0x00010A18}, {0x00010A36, 0x00010A37},
{0x00010A3B, 0x00010A3E}, {0x00010A49, 0x00010A4F}, {0x00010A59, 0x00010A5F}, {0x00010AA0, 0x00010ABF},
{0x00010AE7, 0x00010AEA}, {0x00010AF7, 0x00010AFF}, {0x00010B36, 0x00010B38}, {0x00010B56, 0x00010B57},
{0x00010B73, 0x00010B77}, {0x00010B92, 0x00010B98}, {0x00010B9D, 0x00010BA8}, {0x00010BB0, 0x00010BFF},
{0x00010C49, 0x00010C7F}, {0x00010CB3, 0x00010CBF}, {0x00010CF3, 0x00010CF9}, {0x00010D28, 0x00010D2F},
{0x00010D3A, 0x00010E5F}, {0x00010E7F, 0x00010E7F}, {0x00010EAA, 0x00010EAA}, {0x00010EAE, 0x00010EAF},
{0x00010EB2, 0x00010EFF}, {0x00010F28, 0x00010F2F}, {0x00010F5A, 0x00010FAF}, {0x00010FCC, 0x00010FDF},
{0x00010FF7, 0x00010FFF}, {0x0001104E, 0x00011051}, {0x00011070, 0x0001107E}, {0x000110BD, 0x000110BD},
{0x000110C2, 0x000110CF}, {0x000110E9, 0x000110EF}, {0x000110FA, 0x000110FF}, {0x00011135, 0x00011135},
{0x00011148, 0x0001114F}, {0x00011177, 0x0001117F}, {0x000111E0, 0x000111E0}, {0x000111F5, 0x000111FF},
{0x00011212, 0x00011212}, {0x0001123F, 0x0001127F}, {0x00011287, 0x00011287}, {0x00011289, 0x00011289},
{0x0001128E, 0x0001128E}, {0x0001129E, 0x0001129E}, {0x000112AA, 0x000112AF}, {0x000112EB, 0x000112EF},
{0x000112FA, 0x000112FF}, {0x00011304, 0x00011304}, {0x0001130D, 0x0001130E}, {0x00011311, 0x00011312},
{0x00011329, 0x00011329}, {0x00011331, 0x00011331}, {0x00011334, 0x00011334}, {0x0001133A, 0x0001133A},
{0x00011345, 0x00011346}, {0x00011349, 0x0001134A}, {0x0001134E, 0x0001134F}, {0x00011351, 0x00011356},
{0x00011358, 0x0001135C}, {0x00011364, 0x00011365}, {0x0001136D, 0x0001136F}, {0x00011375, 0x000113FF},
{0x0001145C, 0x0001145C}, {0x00011462, 0x0001147F}, {0x000114C8, 0x000114CF}, {0x000114DA, 0x0001157F},
{0x000115B6, 0x000115B7}, {0x000115DE, 0x000115FF}, {0x00011645, 0x0001164F}, {0x0001165A, 0x0001165F},
{0x0001166D, 0x0001167F}, {0x000116B9, 0x000116BF}, {0x000116CA, 0x000116FF}, {0x0001171B, 0x0001171C},
{0x0001172C, 0x0001172F}, {0x00011740, 0x000117FF}, {0x0001183C, 0x0001189F}, {0x000118F3, 0x000118FE},
{0x00011907, 0x00011908}, {0x0001190A, 0x0001190B}, {0x00011914, 0x00011914}, {0x00011917, 0x00011917},
{0x00011936, 0x00011936}, {0x00011939, 0x0001193A}, {0x00011947, 0x0001194F}, {0x0001195A, 0x0001199F},
{0x000119A8, 0x000119A9}, {0x000119D8, 0x000119D9}, {0x000119E5, 0x000119FF}, {0x00011A48, 0x00011A4F},
{0x00011AA3, 0x00011ABF}, {0x00011AF9, 0x00011BFF}, {0x00011C09, 0x00011C09}, {0x00011C37, 0x00011C37},
{0x0001057B, 0x0001057B}, {0x0001058B, 0x0001058B}, {0x00010593, 0x00010593}, {0x00010596, 0x00010596},
{0x000105A2, 0x000105A2}, {0x000105B2, 0x000105B2}, {0x000105BA, 0x000105BA}, {0x000105BD, 0x000105FF},
{0x00010737, 0x0001073F}, {0x00010756, 0x0001075F}, {0x00010768, 0x0001077F}, {0x00010786, 0x00010786},
{0x000107B1, 0x000107B1}, {0x000107BB, 0x000107FF}, {0x00010806, 0x00010807}, {0x00010809, 0x00010809},
{0x00010836, 0x00010836}, {0x00010839, 0x0001083B}, {0x0001083D, 0x0001083E}, {0x00010856, 0x00010856},
{0x0001089F, 0x000108A6}, {0x000108B0, 0x000108DF}, {0x000108F3, 0x000108F3}, {0x000108F6, 0x000108FA},
{0x0001091C, 0x0001091E}, {0x0001093A, 0x0001093E}, {0x00010940, 0x0001097F}, {0x000109B8, 0x000109BB},
{0x000109D0, 0x000109D1}, {0x00010A04, 0x00010A04}, {0x00010A07, 0x00010A0B}, {0x00010A14, 0x00010A14},
{0x00010A18, 0x00010A18}, {0x00010A36, 0x00010A37}, {0x00010A3B, 0x00010A3E}, {0x00010A49, 0x00010A4F},
{0x00010A59, 0x00010A5F}, {0x00010AA0, 0x00010ABF}, {0x00010AE7, 0x00010AEA}, {0x00010AF7, 0x00010AFF},
{0x00010B36, 0x00010B38}, {0x00010B56, 0x00010B57}, {0x00010B73, 0x00010B77}, {0x00010B92, 0x00010B98},
{0x00010B9D, 0x00010BA8}, {0x00010BB0, 0x00010BFF}, {0x00010C49, 0x00010C7F}, {0x00010CB3, 0x00010CBF},
{0x00010CF3, 0x00010CF9}, {0x00010D28, 0x00010D2F}, {0x00010D3A, 0x00010E5F}, {0x00010E7F, 0x00010E7F},
{0x00010EAA, 0x00010EAA}, {0x00010EAE, 0x00010EAF}, {0x00010EB2, 0x00010EFC}, {0x00010F28, 0x00010F2F},
{0x00010F5A, 0x00010F6F}, {0x00010F8A, 0x00010FAF}, {0x00010FCC, 0x00010FDF}, {0x00010FF7, 0x00010FFF},
{0x0001104E, 0x00011051}, {0x00011076, 0x0001107E}, {0x000110BD, 0x000110BD}, {0x000110C3, 0x000110CF},
{0x000110E9, 0x000110EF}, {0x000110FA, 0x000110FF}, {0x00011135, 0x00011135}, {0x00011148, 0x0001114F},
{0x00011177, 0x0001117F}, {0x000111E0, 0x000111E0}, {0x000111F5, 0x000111FF}, {0x00011212, 0x00011212},
{0x00011242, 0x0001127F}, {0x00011287, 0x00011287}, {0x00011289, 0x00011289}, {0x0001128E, 0x0001128E},
{0x0001129E, 0x0001129E}, {0x000112AA, 0x000112AF}, {0x000112EB, 0x000112EF}, {0x000112FA, 0x000112FF},
{0x00011304, 0x00011304}, {0x0001130D, 0x0001130E}, {0x00011311, 0x00011312}, {0x00011329, 0x00011329},
{0x00011331, 0x00011331}, {0x00011334, 0x00011334}, {0x0001133A, 0x0001133A}, {0x00011345, 0x00011346},
{0x00011349, 0x0001134A}, {0x0001134E, 0x0001134F}, {0x00011351, 0x00011356}, {0x00011358, 0x0001135C},
{0x00011364, 0x00011365}, {0x0001136D, 0x0001136F}, {0x00011375, 0x000113FF}, {0x0001145C, 0x0001145C},
{0x00011462, 0x0001147F}, {0x000114C8, 0x000114CF}, {0x000114DA, 0x0001157F}, {0x000115B6, 0x000115B7},
{0x000115DE, 0x000115FF}, {0x00011645, 0x0001164F}, {0x0001165A, 0x0001165F}, {0x0001166D, 0x0001167F},
{0x000116BA, 0x000116BF}, {0x000116CA, 0x000116FF}, {0x0001171B, 0x0001171C}, {0x0001172C, 0x0001172F},
{0x00011747, 0x000117FF}, {0x0001183C, 0x0001189F}, {0x000118F3, 0x000118FE}, {0x00011907, 0x00011908},
{0x0001190A, 0x0001190B}, {0x00011914, 0x00011914}, {0x00011917, 0x00011917}, {0x00011936, 0x00011936},
{0x00011939, 0x0001193A}, {0x00011947, 0x0001194F}, {0x0001195A, 0x0001199F}, {0x000119A8, 0x000119A9},
{0x000119D8, 0x000119D9}, {0x000119E5, 0x000119FF}, {0x00011A48, 0x00011A4F}, {0x00011AA3, 0x00011AAF},
{0x00011AF9, 0x00011AFF}, {0x00011B0A, 0x00011BFF}, {0x00011C09, 0x00011C09}, {0x00011C37, 0x00011C37},
{0x00011C46, 0x00011C4F}, {0x00011C6D, 0x00011C6F}, {0x00011C90, 0x00011C91}, {0x00011CA8, 0x00011CA8},
{0x00011CB7, 0x00011CFF}, {0x00011D07, 0x00011D07}, {0x00011D0A, 0x00011D0A}, {0x00011D37, 0x00011D39},
{0x00011D3B, 0x00011D3B}, {0x00011D3E, 0x00011D3E}, {0x00011D48, 0x00011D4F}, {0x00011D5A, 0x00011D5F},
{0x00011D66, 0x00011D66}, {0x00011D69, 0x00011D69}, {0x00011D8F, 0x00011D8F}, {0x00011D92, 0x00011D92},
{0x00011D99, 0x00011D9F}, {0x00011DAA, 0x00011EDF}, {0x00011EF9, 0x00011FAF}, {0x00011FB1, 0x00011FBF},
{0x00011FF2, 0x00011FFE}, {0x0001239A, 0x000123FF}, {0x0001246F, 0x0001246F}, {0x00012475, 0x0001247F},
{0x00012544, 0x00012FFF}, {0x0001342F, 0x000143FF}, {0x00014647, 0x000167FF}, {0x00016A39, 0x00016A3F},
{0x00016A5F, 0x00016A5F}, {0x00016A6A, 0x00016A6D}, {0x00016A70, 0x00016ACF}, {0x00016AEE, 0x00016AEF},
{0x00016AF6, 0x00016AFF}, {0x00016B46, 0x00016B4F}, {0x00016B5A, 0x00016B5A}, {0x00016B62, 0x00016B62},
{0x00016B78, 0x00016B7C}, {0x00016B90, 0x00016E3F}, {0x00016E9B, 0x00016EFF}, {0x00016F4B, 0x00016F4E},
{0x00016F88, 0x00016F8E}, {0x00016FA0, 0x00016FDF}, {0x00016FE5, 0x00016FEF}, {0x00016FF2, 0x00016FFF},
{0x000187F8, 0x000187FF}, {0x00018CD6, 0x00018CFF}, {0x00018D09, 0x0001AFFF}, {0x0001B11F, 0x0001B14F},
{0x0001B153, 0x0001B163}, {0x0001B168, 0x0001B16F}, {0x0001B2FC, 0x0001BBFF}, {0x0001BC6B, 0x0001BC6F},
{0x0001BC7D, 0x0001BC7F}, {0x0001BC89, 0x0001BC8F}, {0x0001BC9A, 0x0001BC9B}, {0x0001BCA0, 0x0001CFFF},
{0x0001D0F6, 0x0001D0FF}, {0x0001D127, 0x0001D128}, {0x0001D173, 0x0001D17A}, {0x0001D1E9, 0x0001D1FF},
{0x0001D246, 0x0001D2DF}, {0x0001D2F4, 0x0001D2FF}, {0x0001D357, 0x0001D35F}, {0x0001D379, 0x0001D3FF},
{0x0001D455, 0x0001D455}, {0x0001D49D, 0x0001D49D}, {0x0001D4A0, 0x0001D4A1}, {0x0001D4A3, 0x0001D4A4},
{0x0001D4A7, 0x0001D4A8}, {0x0001D4AD, 0x0001D4AD}, {0x0001D4BA, 0x0001D4BA}, {0x0001D4BC, 0x0001D4BC},
{0x0001D4C4, 0x0001D4C4}, {0x0001D506, 0x0001D506}, {0x0001D50B, 0x0001D50C}, {0x0001D515, 0x0001D515},
{0x0001D51D, 0x0001D51D}, {0x0001D53A, 0x0001D53A}, {0x0001D53F, 0x0001D53F}, {0x0001D545, 0x0001D545},
{0x0001D547, 0x0001D549}, {0x0001D551, 0x0001D551}, {0x0001D6A6, 0x0001D6A7}, {0x0001D7CC, 0x0001D7CD},
{0x0001DA8C, 0x0001DA9A}, {0x0001DAA0, 0x0001DAA0}, {0x0001DAB0, 0x0001DFFF}, {0x0001E007, 0x0001E007},
{0x0001E019, 0x0001E01A}, {0x0001E022, 0x0001E022}, {0x0001E025, 0x0001E025}, {0x0001E02B, 0x0001E0FF},
{0x0001E12D, 0x0001E12F}, {0x0001E13E, 0x0001E13F}, {0x0001E14A, 0x0001E14D}, {0x0001E150, 0x0001E2BF},
{0x0001E2FA, 0x0001E2FE}, {0x0001E300, 0x0001E7FF}, {0x0001E8C5, 0x0001E8C6}, {0x0001E8D7, 0x0001E8FF},
{0x0001E94C, 0x0001E94F}, {0x0001E95A, 0x0001E95D}, {0x0001E960, 0x0001EC70}, {0x0001ECB5, 0x0001ED00},
{0x0001ED3E, 0x0001EDFF}, {0x0001EE04, 0x0001EE04}, {0x0001EE20, 0x0001EE20}, {0x0001EE23, 0x0001EE23},
{0x0001EE25, 0x0001EE26}, {0x0001EE28, 0x0001EE28}, {0x0001EE33, 0x0001EE33}, {0x0001EE38, 0x0001EE38},
{0x0001EE3A, 0x0001EE3A}, {0x0001EE3C, 0x0001EE41}, {0x0001EE43, 0x0001EE46}, {0x0001EE48, 0x0001EE48},
{0x0001EE4A, 0x0001EE4A}, {0x0001EE4C, 0x0001EE4C}, {0x0001EE50, 0x0001EE50}, {0x0001EE53, 0x0001EE53},
{0x0001EE55, 0x0001EE56}, {0x0001EE58, 0x0001EE58}, {0x0001EE5A, 0x0001EE5A}, {0x0001EE5C, 0x0001EE5C},
{0x0001EE5E, 0x0001EE5E}, {0x0001EE60, 0x0001EE60}, {0x0001EE63, 0x0001EE63}, {0x0001EE65, 0x0001EE66},
{0x0001EE6B, 0x0001EE6B}, {0x0001EE73, 0x0001EE73}, {0x0001EE78, 0x0001EE78}, {0x0001EE7D, 0x0001EE7D},
{0x0001EE7F, 0x0001EE7F}, {0x0001EE8A, 0x0001EE8A}, {0x0001EE9C, 0x0001EEA0}, {0x0001EEA4, 0x0001EEA4},
{0x0001EEAA, 0x0001EEAA}, {0x0001EEBC, 0x0001EEEF}, {0x0001EEF2, 0x0001EFFF}, {0x0001F02C, 0x0001F02F},
{0x0001F094, 0x0001F09F}, {0x0001F0AF, 0x0001F0B0}, {0x0001F0C0, 0x0001F0C0}, {0x0001F0D0, 0x0001F0D0},
{0x0001F0F6, 0x0001F0FF}, {0x0001F1AE, 0x0001F1E5}, {0x0001F203, 0x0001F20F}, {0x0001F23C, 0x0001F23F},
{0x0001F249, 0x0001F24F}, {0x0001F252, 0x0001F25F}, {0x0001F266, 0x0001F2FF}, {0x0001F6D8, 0x0001F6DF},
{0x0001F6ED, 0x0001F6EF}, {0x0001F6FD, 0x0001F6FF}, {0x0001F774, 0x0001F77F}, {0x0001F7D9, 0x0001F7DF},
{0x0001F7EC, 0x0001F7FF}, {0x0001F80C, 0x0001F80F}, {0x0001F848, 0x0001F84F}, {0x0001F85A, 0x0001F85F},
{0x0001F888, 0x0001F88F}, {0x0001F8AE, 0x0001F8AF}, {0x0001F8B2, 0x0001F8FF}, {0x0001F979, 0x0001F979},
{0x0001F9CC, 0x0001F9CC}, {0x0001FA54, 0x0001FA5F}, {0x0001FA6E, 0x0001FA6F}, {0x0001FA75, 0x0001FA77},
{0x0001FA7B, 0x0001FA7F}, {0x0001FA87, 0x0001FA8F}, {0x0001FAA9, 0x0001FAAF}, {0x0001FAB7, 0x0001FABF},
{0x0001FAC3, 0x0001FACF}, {0x0001FAD7, 0x0001FAFF}, {0x0001FB93, 0x0001FB93}, {0x0001FBCB, 0x0001FBEF},
{0x0001FBFA, 0x0001FFFF}, {0x0002A6DE, 0x0002A6FF}, {0x0002B735, 0x0002B73F}, {0x0002B81E, 0x0002B81F},
{0x0002CEA2, 0x0002CEAF}, {0x0002EBE1, 0x0002F7FF}, {0x0002FA1E, 0x0002FFFF}, {0x0003134B, 0x000E00FF},
{0x000E01F0, 0x0010FFFF},
{0x00011D99, 0x00011D9F}, {0x00011DAA, 0x00011EDF}, {0x00011EF9, 0x00011EFF}, {0x00011F11, 0x00011F11},
{0x00011F3B, 0x00011F3D}, {0x00011F5A, 0x00011FAF}, {0x00011FB1, 0x00011FBF}, {0x00011FF2, 0x00011FFE},
{0x0001239A, 0x000123FF}, {0x0001246F, 0x0001246F}, {0x00012475, 0x0001247F}, {0x00012544, 0x00012F8F},
{0x00012FF3, 0x00012FFF}, {0x00013430, 0x0001343F}, {0x00013456, 0x000143FF}, {0x00014647, 0x000167FF},
{0x00016A39, 0x00016A3F}, {0x00016A5F, 0x00016A5F}, {0x00016A6A, 0x00016A6D}, {0x00016ABF, 0x00016ABF},
{0x00016ACA, 0x00016ACF}, {0x00016AEE, 0x00016AEF}, {0x00016AF6, 0x00016AFF}, {0x00016B46, 0x00016B4F},
{0x00016B5A, 0x00016B5A}, {0x00016B62, 0x00016B62}, {0x00016B78, 0x00016B7C}, {0x00016B90, 0x00016E3F},
{0x00016E9B, 0x00016EFF}, {0x00016F4B, 0x00016F4E}, {0x00016F88, 0x00016F8E}, {0x00016FA0, 0x00016FDF},
{0x00016FE5, 0x00016FEF}, {0x00016FF2, 0x00016FFF}, {0x000187F8, 0x000187FF}, {0x00018CD6, 0x00018CFF},
{0x00018D09, 0x0001AFEF}, {0x0001AFF4, 0x0001AFF4}, {0x0001AFFC, 0x0001AFFC}, {0x0001AFFF, 0x0001AFFF},
{0x0001B123, 0x0001B131}, {0x0001B133, 0x0001B14F}, {0x0001B153, 0x0001B154}, {0x0001B156, 0x0001B163},
{0x0001B168, 0x0001B16F}, {0x0001B2FC, 0x0001BBFF}, {0x0001BC6B, 0x0001BC6F}, {0x0001BC7D, 0x0001BC7F},
{0x0001BC89, 0x0001BC8F}, {0x0001BC9A, 0x0001BC9B}, {0x0001BCA0, 0x0001CEFF}, {0x0001CF2E, 0x0001CF2F},
{0x0001CF47, 0x0001CF4F}, {0x0001CFC4, 0x0001CFFF}, {0x0001D0F6, 0x0001D0FF}, {0x0001D127, 0x0001D128},
{0x0001D173, 0x0001D17A}, {0x0001D1EB, 0x0001D1FF}, {0x0001D246, 0x0001D2BF}, {0x0001D2D4, 0x0001D2DF},
{0x0001D2F4, 0x0001D2FF}, {0x0001D357, 0x0001D35F}, {0x0001D379, 0x0001D3FF}, {0x0001D455, 0x0001D455},
{0x0001D49D, 0x0001D49D}, {0x0001D4A0, 0x0001D4A1}, {0x0001D4A3, 0x0001D4A4}, {0x0001D4A7, 0x0001D4A8},
{0x0001D4AD, 0x0001D4AD}, {0x0001D4BA, 0x0001D4BA}, {0x0001D4BC, 0x0001D4BC}, {0x0001D4C4, 0x0001D4C4},
{0x0001D506, 0x0001D506}, {0x0001D50B, 0x0001D50C}, {0x0001D515, 0x0001D515}, {0x0001D51D, 0x0001D51D},
{0x0001D53A, 0x0001D53A}, {0x0001D53F, 0x0001D53F}, {0x0001D545, 0x0001D545}, {0x0001D547, 0x0001D549},
{0x0001D551, 0x0001D551}, {0x0001D6A6, 0x0001D6A7}, {0x0001D7CC, 0x0001D7CD}, {0x0001DA8C, 0x0001DA9A},
{0x0001DAA0, 0x0001DAA0}, {0x0001DAB0, 0x0001DEFF}, {0x0001DF1F, 0x0001DF24}, {0x0001DF2B, 0x0001DFFF},
{0x0001E007, 0x0001E007}, {0x0001E019, 0x0001E01A}, {0x0001E022, 0x0001E022}, {0x0001E025, 0x0001E025},
{0x0001E02B, 0x0001E02F}, {0x0001E06E, 0x0001E08E}, {0x0001E090, 0x0001E0FF}, {0x0001E12D, 0x0001E12F},
{0x0001E13E, 0x0001E13F}, {0x0001E14A, 0x0001E14D}, {0x0001E150, 0x0001E28F}, {0x0001E2AF, 0x0001E2BF},
{0x0001E2FA, 0x0001E2FE}, {0x0001E300, 0x0001E4CF}, {0x0001E4FA, 0x0001E7DF}, {0x0001E7E7, 0x0001E7E7},
{0x0001E7EC, 0x0001E7EC}, {0x0001E7EF, 0x0001E7EF}, {0x0001E7FF, 0x0001E7FF}, {0x0001E8C5, 0x0001E8C6},
{0x0001E8D7, 0x0001E8FF}, {0x0001E94C, 0x0001E94F}, {0x0001E95A, 0x0001E95D}, {0x0001E960, 0x0001EC70},
{0x0001ECB5, 0x0001ED00}, {0x0001ED3E, 0x0001EDFF}, {0x0001EE04, 0x0001EE04}, {0x0001EE20, 0x0001EE20},
{0x0001EE23, 0x0001EE23}, {0x0001EE25, 0x0001EE26}, {0x0001EE28, 0x0001EE28}, {0x0001EE33, 0x0001EE33},
{0x0001EE38, 0x0001EE38}, {0x0001EE3A, 0x0001EE3A}, {0x0001EE3C, 0x0001EE41}, {0x0001EE43, 0x0001EE46},
{0x0001EE48, 0x0001EE48}, {0x0001EE4A, 0x0001EE4A}, {0x0001EE4C, 0x0001EE4C}, {0x0001EE50, 0x0001EE50},
{0x0001EE53, 0x0001EE53}, {0x0001EE55, 0x0001EE56}, {0x0001EE58, 0x0001EE58}, {0x0001EE5A, 0x0001EE5A},
{0x0001EE5C, 0x0001EE5C}, {0x0001EE5E, 0x0001EE5E}, {0x0001EE60, 0x0001EE60}, {0x0001EE63, 0x0001EE63},
{0x0001EE65, 0x0001EE66}, {0x0001EE6B, 0x0001EE6B}, {0x0001EE73, 0x0001EE73}, {0x0001EE78, 0x0001EE78},
{0x0001EE7D, 0x0001EE7D}, {0x0001EE7F, 0x0001EE7F}, {0x0001EE8A, 0x0001EE8A}, {0x0001EE9C, 0x0001EEA0},
{0x0001EEA4, 0x0001EEA4}, {0x0001EEAA, 0x0001EEAA}, {0x0001EEBC, 0x0001EEEF}, {0x0001EEF2, 0x0001EFFF},
{0x0001F02C, 0x0001F02F}, {0x0001F094, 0x0001F09F}, {0x0001F0AF, 0x0001F0B0}, {0x0001F0C0, 0x0001F0C0},
{0x0001F0D0, 0x0001F0D0}, {0x0001F0F6, 0x0001F0FF}, {0x0001F1AE, 0x0001F1E5}, {0x0001F203, 0x0001F20F},
{0x0001F23C, 0x0001F23F}, {0x0001F249, 0x0001F24F}, {0x0001F252, 0x0001F25F}, {0x0001F266, 0x0001F2FF},
{0x0001F6D8, 0x0001F6DB}, {0x0001F6ED, 0x0001F6EF}, {0x0001F6FD, 0x0001F6FF}, {0x0001F777, 0x0001F77A},
{0x0001F7DA, 0x0001F7DF}, {0x0001F7EC, 0x0001F7EF}, {0x0001F7F1, 0x0001F7FF}, {0x0001F80C, 0x0001F80F},
{0x0001F848, 0x0001F84F}, {0x0001F85A, 0x0001F85F}, {0x0001F888, 0x0001F88F}, {0x0001F8AE, 0x0001F8AF},
{0x0001F8B2, 0x0001F8FF}, {0x0001FA54, 0x0001FA5F}, {0x0001FA6E, 0x0001FA6F}, {0x0001FA7D, 0x0001FA7F},
{0x0001FA89, 0x0001FA8F}, {0x0001FABE, 0x0001FABE}, {0x0001FAC6, 0x0001FACD}, {0x0001FADC, 0x0001FADF},
{0x0001FAE9, 0x0001FAEF}, {0x0001FAF9, 0x0001FAFF}, {0x0001FB93, 0x0001FB93}, {0x0001FBCB, 0x0001FBEF},
{0x0001FBFA, 0x0001FFFF}, {0x0002A6E0, 0x0002A6FF}, {0x0002B73A, 0x0002B73F}, {0x0002B81E, 0x0002B81F},
{0x0002CEA2, 0x0002CEAF}, {0x0002EBE1, 0x0002F7FF}, {0x0002FA1E, 0x0002FFFF}, {0x0003134B, 0x0003134F},
{0x000323B0, 0x000E00FF}, {0x000E01F0, 0x0010FFFF},
};
const std::multimap<uint32_t, uint32_t> unicode_map_nfd = {

View File

@ -5,7 +5,7 @@
#include <utility>
#include <vector>
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_digit;
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_number;
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter;
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace;
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_accent_mark;

View File

@ -110,9 +110,9 @@ static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset)
static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
std::unordered_map<uint32_t, int> cpt_types;
for (auto p : unicode_ranges_digit) {
for (auto p : unicode_ranges_number) {
for (auto i = p.first; i <= p.second; ++ i) {
cpt_types[i] = CODEPOINT_TYPE_DIGIT;
cpt_types[i] = CODEPOINT_TYPE_NUMBER;
}
}
for (auto p : unicode_ranges_letter) {
@ -300,13 +300,13 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
collecting_letter = true;
collecting = true;
}
else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_NUMBER || (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_NUMBER)) {
collecting_numeric = true;
collecting = true;
}
else if (
((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
(token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_NUMBER) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
(token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_NUMBER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
) {
collecting_special = true;
collecting = true;
@ -323,13 +323,13 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) {
split_condition = true;
}
else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_NUMBER) {
split_condition = true;
}
else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_NUMBER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
split_condition = true;
}
else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_NUMBER)) {
split_condition = true;
}
}
@ -524,19 +524,19 @@ char32_t unicode_tolower(char32_t cp) {
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
// unicode categories
static const std::map<std::string, int> k_ucat_enum = {
{ "\\p{N}", CODEPOINT_TYPE_DIGIT },
{ "\\p{N}", CODEPOINT_TYPE_NUMBER },
{ "\\p{L}", CODEPOINT_TYPE_LETTER },
{ "\\p{P}", CODEPOINT_TYPE_PUNCTUATION },
};
static const std::map<int, int> k_ucat_cpt = {
{ CODEPOINT_TYPE_DIGIT, 0xD1 },
{ CODEPOINT_TYPE_NUMBER, 0xD1 },
{ CODEPOINT_TYPE_LETTER, 0xD2 },
{ CODEPOINT_TYPE_PUNCTUATION, 0xD3 },
};
static const std::map<int, std::string> k_ucat_map = {
{ CODEPOINT_TYPE_DIGIT, "\x30-\x39" }, // 0-9
{ CODEPOINT_TYPE_NUMBER, "\x30-\x39" }, // 0-9
{ CODEPOINT_TYPE_LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z
{ CODEPOINT_TYPE_PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
};

View File

@ -5,7 +5,7 @@
#include <vector>
#define CODEPOINT_TYPE_UNIDENTIFIED 0
#define CODEPOINT_TYPE_DIGIT 1
#define CODEPOINT_TYPE_NUMBER 1
#define CODEPOINT_TYPE_LETTER 2
#define CODEPOINT_TYPE_WHITESPACE 3
#define CODEPOINT_TYPE_ACCENT_MARK 4