llama : handle unknown utf8 bytes (#7588)

2024-12-26 03:14:35 +00:00 · 2024-05-28 13:55:35 +03:00 · 2024-05-28 13:55:35 +03:00 · 8b99e2aa66
commit 8b99e2aa66
parent 271ff3fc44
1 changed file with 10 additions and 1 deletion
--- a/llama.cpp
+++ b/llama.cpp
@@ -17940,7 +17940,16 @@ static std::string llama_decode_text(const std::string & text) {
    const auto cpts = unicode_cpts_from_utf8(text);
    for (const auto cpt : cpts) {
-        decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt));
+        const auto utf8 = unicode_cpt_to_utf8(cpt);
        try {
            decoded_text += unicode_utf8_to_byte(utf8);
        } catch (const std::out_of_range & e) {
            decoded_text += "[UNK_BYTE_0x";
            for (const auto c : utf8) {
                decoded_text += format("%02x", (uint8_t) c);
            }
            decoded_text += text + "]";
        }
    }
    return decoded_text;