Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2024-12-25 02:44:36 +00:00)
wpm : portable unicode tolower (#6305)
Also use C locale for ispunct/isspace, and split unicode-data.cpp from unicode.cpp.
This commit is contained in:
parent 557410b8f0
commit 32c8486e1f
@ -1170,6 +1170,7 @@ add_library(llama
|
|||||||
llama.h
|
llama.h
|
||||||
unicode.h
|
unicode.h
|
||||||
unicode.cpp
|
unicode.cpp
|
||||||
|
unicode-data.cpp
|
||||||
)
|
)
|
||||||
|
|
||||||
target_include_directories(llama PUBLIC .)
|
target_include_directories(llama PUBLIC .)
|
||||||
|
5
Makefile
5
Makefile
@ -678,7 +678,10 @@ ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
|
|||||||
unicode.o: unicode.cpp unicode.h
|
unicode.o: unicode.cpp unicode.h
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
|
||||||
OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o
|
unicode-data.o: unicode-data.cpp unicode-data.h
|
||||||
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
|
||||||
|
OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
|
||||||
|
|
||||||
llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
|
llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
@ -32,6 +32,7 @@ let package = Package(
|
|||||||
"ggml.c",
|
"ggml.c",
|
||||||
"llama.cpp",
|
"llama.cpp",
|
||||||
"unicode.cpp",
|
"unicode.cpp",
|
||||||
|
"unicode-data.cpp",
|
||||||
"ggml-alloc.c",
|
"ggml-alloc.c",
|
||||||
"ggml-backend.c",
|
"ggml-backend.c",
|
||||||
"ggml-quants.c",
|
"ggml-quants.c",
|
||||||
|
15
build.zig
15
build.zig
@ -116,6 +116,7 @@ pub fn build(b: *std.build.Builder) !void {
|
|||||||
const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
|
const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
|
||||||
const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
|
const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
|
||||||
const unicode = make.obj("unicode", "unicode.cpp");
|
const unicode = make.obj("unicode", "unicode.cpp");
|
||||||
|
const unicode_data = make.obj("unicode-data", "unicode-data.cpp");
|
||||||
const llama = make.obj("llama", "llama.cpp");
|
const llama = make.obj("llama", "llama.cpp");
|
||||||
const buildinfo = make.obj("common", "common/build-info.cpp");
|
const buildinfo = make.obj("common", "common/build-info.cpp");
|
||||||
const common = make.obj("common", "common/common.cpp");
|
const common = make.obj("common", "common/common.cpp");
|
||||||
@ -127,14 +128,14 @@ pub fn build(b: *std.build.Builder) !void {
|
|||||||
const clip = make.obj("clip", "examples/llava/clip.cpp");
|
const clip = make.obj("clip", "examples/llava/clip.cpp");
|
||||||
const llava = make.obj("llava", "examples/llava/llava.cpp");
|
const llava = make.obj("llava", "examples/llava/llava.cpp");
|
||||||
|
|
||||||
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, sampling, console, grammar_parser });
|
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, sampling, console, grammar_parser });
|
||||||
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo });
|
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
|
||||||
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo });
|
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
|
||||||
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo });
|
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
|
||||||
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, train });
|
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, train });
|
||||||
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, train });
|
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, train });
|
||||||
|
|
||||||
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, sampling, grammar_parser, json_schema_to_grammar, clip, llava });
|
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, sampling, grammar_parser, json_schema_to_grammar, clip, llava });
|
||||||
if (server.target.isWindows()) {
|
if (server.target.isWindows()) {
|
||||||
server.linkSystemLibrary("ws2_32");
|
server.linkSystemLibrary("ws2_32");
|
||||||
}
|
}
|
||||||
|
22
llama.cpp
22
llama.cpp
@ -61,6 +61,7 @@
|
|||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <array>
|
#include <array>
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
|
#include <cctype>
|
||||||
#include <cfloat>
|
#include <cfloat>
|
||||||
#include <cinttypes>
|
#include <cinttypes>
|
||||||
#include <climits>
|
#include <climits>
|
||||||
@ -71,7 +72,6 @@
|
|||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <ctime>
|
#include <ctime>
|
||||||
#include <cwctype>
|
|
||||||
#include <forward_list>
|
#include <forward_list>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <functional>
|
#include <functional>
|
||||||
@ -11010,7 +11010,7 @@ struct llm_tokenizer_wpm {
|
|||||||
if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
|
if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
code = to_lower(code);
|
code = unicode_tolower(code);
|
||||||
if (type == CODEPOINT_TYPE_WHITESPACE) {
|
if (type == CODEPOINT_TYPE_WHITESPACE) {
|
||||||
code = ' ';
|
code = ' ';
|
||||||
}
|
}
|
||||||
@ -11030,7 +11030,7 @@ struct llm_tokenizer_wpm {
|
|||||||
std::vector<std::string> words;
|
std::vector<std::string> words;
|
||||||
while (r < new_str.size()) {
|
while (r < new_str.size()) {
|
||||||
// if is whitespace
|
// if is whitespace
|
||||||
if (isspace(new_str[r])) {
|
if (isspace(new_str[r], std::locale::classic())) {
|
||||||
if (r > l) words.push_back(new_str.substr(l, (r - l)));
|
if (r > l) words.push_back(new_str.substr(l, (r - l)));
|
||||||
l = r + 1;
|
l = r + 1;
|
||||||
r = l;
|
r = l;
|
||||||
@ -11044,18 +11044,12 @@ struct llm_tokenizer_wpm {
|
|||||||
return words;
|
return words;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t to_lower(uint32_t code) {
|
|
||||||
static const std::locale locale("en_US.UTF-8");
|
|
||||||
#if defined(_WIN32)
|
|
||||||
if (code > 0xFFFF) {
|
|
||||||
return code;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
return std::tolower(wchar_t(code), locale);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool is_ascii_punct(uint32_t code) {
|
bool is_ascii_punct(uint32_t code) {
|
||||||
return code < 256 && ispunct(code);
|
if (code > 0xFF) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
auto c = char(static_cast<unsigned char>(code));
|
||||||
|
return ispunct(c, std::locale::classic());
|
||||||
}
|
}
|
||||||
|
|
||||||
bool is_chinese_char(uint32_t cpt) {
|
bool is_chinese_char(uint32_t cpt) {
|
||||||
|
1651
unicode-data.cpp
Normal file
1651
unicode-data.cpp
Normal file
File diff suppressed because it is too large
Load Diff
16
unicode-data.h
Normal file
16
unicode-data.h
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <cstdint>
|
||||||
|
#include <map>
|
||||||
|
#include <utility>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_digit;
|
||||||
|
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter;
|
||||||
|
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace;
|
||||||
|
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_accent_mark;
|
||||||
|
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_punctuation;
|
||||||
|
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_symbol;
|
||||||
|
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_control;
|
||||||
|
extern const std::multimap<uint32_t, uint32_t> unicode_map_nfd;
|
||||||
|
extern const std::map<char32_t, char32_t> unicode_map_lowercase;
|
1411
unicode.cpp
1411
unicode.cpp
File diff suppressed because it is too large
Load Diff
@ -24,3 +24,5 @@ int unicode_cpt_type(const std::string & utf8);
|
|||||||
std::string unicode_byte_to_utf8(uint8_t byte);
|
std::string unicode_byte_to_utf8(uint8_t byte);
|
||||||
uint8_t unicode_utf8_to_byte(const std::string & utf8);
|
uint8_t unicode_utf8_to_byte(const std::string & utf8);
|
||||||
|
|
||||||
|
// simple tolower that only implements one-to-one mapping, not one-to-many
|
||||||
|
char32_t unicode_tolower(char32_t cp);
|
||||||
|
Loading…
Reference in New Issue
Block a user