mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-11-11 13:30:35 +00:00
b43272afa2
* Replace CODEPOINT_TYPE_* with codepoint_flags
* Update and bugfix brute force random test
* Deterministic brute force random test
* Unicode normalization NFD
* Get rid of BOM
21 lines
582 B
C++
21 lines
582 B
C++
#pragma once
|
|
|
|
#include <cstdint>
|
|
#include <vector>
|
|
#include <unordered_map>
|
|
#include <unordered_set>
|
|
|
|
// One entry of a range-based NFD table.
//
// Plain aggregate of three 32-bit values; the member order is part of the
// interface because entries are expected to be brace-initialized positionally.
//
// NOTE(review): the field meanings below are inferred from the names only
// (the table contents are not visible here) -- confirm against the table
// definition:
//   first - presumably the first code point of the range
//   last  - presumably the last code point of the range (likely inclusive)
//   nfd   - presumably the code point the range maps to under NFD normalization
struct range_nfd {
    uint32_t first;
    uint32_t last;
    uint32_t nfd;
};
|
|
|
|
// Size of the Unicode code space: code points run from U+0000 to U+10FFFF,
// so this value (0x110000) is one past the largest code point.
static const uint32_t MAX_CODEPOINTS = 0x10FFFFu + 1u;
|
|
|
|
extern const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;
|
|
extern const std::unordered_set<uint32_t> unicode_set_whitespace;
|
|
extern const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase;
|
|
extern const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase;
|
|
extern const std::vector<range_nfd> unicode_ranges_nfd;
|