mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-26 03:14:35 +00:00
llama : handle unknown utf8 bytes (#7588)
This commit is contained in:
parent
271ff3fc44
commit
8b99e2aa66
11
llama.cpp
11
llama.cpp
@ -17940,7 +17940,16 @@ static std::string llama_decode_text(const std::string & text) {
|
|||||||
|
|
||||||
const auto cpts = unicode_cpts_from_utf8(text);
|
const auto cpts = unicode_cpts_from_utf8(text);
|
||||||
for (const auto cpt : cpts) {
|
for (const auto cpt : cpts) {
|
||||||
decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt));
|
const auto utf8 = unicode_cpt_to_utf8(cpt);
|
||||||
|
try {
|
||||||
|
decoded_text += unicode_utf8_to_byte(utf8);
|
||||||
|
} catch (const std::out_of_range & e) {
|
||||||
|
decoded_text += "[UNK_BYTE_0x";
|
||||||
|
for (const auto c : utf8) {
|
||||||
|
decoded_text += format("%02x", (uint8_t) c);
|
||||||
|
}
|
||||||
|
decoded_text += text + "]";
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return decoded_text;
|
return decoded_text;
|
||||||
|
Loading…
Reference in New Issue
Block a user