From 549279d8049d78620a2b081e26edb654f83c3bbd Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 3 Jun 2024 08:34:43 +0300 Subject: [PATCH] llama : avoid double token-to-piece cache (#7654) ggml-ci --- llama.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/llama.cpp b/llama.cpp index 841be1de7..e90da793c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2164,8 +2164,7 @@ struct llama_vocab { std::vector id_to_token; std::vector cache_special_tokens; - std::vector cache_token_to_piece; // llama_token_to_piece(special = false); - std::vector cache_token_to_piece_special; // llama_token_to_piece(special = true); + std::vector cache_token_to_piece; // llama_token_to_piece(special = true); std::map, int> bpe_ranks; @@ -4845,23 +4844,19 @@ static void llm_load_vocab( LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size()); } - // build token to piece caches + // build token to piece cache { size_t size_cache = 0; - std::vector cache_token_to_piece (n_vocab); - std::vector cache_token_to_piece_special(n_vocab); + std::vector cache_token_to_piece(n_vocab); for (uint32_t id = 0; id < n_vocab; ++id) { - cache_token_to_piece[id] = llama_token_to_piece(&model, id, false); - cache_token_to_piece_special[id] = llama_token_to_piece(&model, id, true); + cache_token_to_piece[id] = llama_token_to_piece(&model, id, true); size_cache += cache_token_to_piece[id].size(); - size_cache += cache_token_to_piece_special[id].size(); } - std::swap(vocab.cache_token_to_piece, cache_token_to_piece); - std::swap(vocab.cache_token_to_piece_special, cache_token_to_piece_special); + std::swap(vocab.cache_token_to_piece, cache_token_to_piece); LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0); } @@ -18318,9 +18313,14 @@ static std::string llama_decode_text(const std::string & text) { // does not write null-terminator to buf int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) { + // ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843 + if (!special && llama_is_control_token(model->vocab, token)) { + return 0; + } + // if we have a cache - use it { - const auto & cache = special ? model->vocab.cache_token_to_piece_special : model->vocab.cache_token_to_piece; + const auto & cache = model->vocab.cache_token_to_piece; if (!cache.empty()) { const auto & res = cache.at(token);