py : fix missing `added_tokens_dict` attribute for the SPM (SentencePiece) vocab

This commit is contained in:
Georgi Gerganov 2024-01-16 13:38:54 +02:00
parent a0b3ac8c48
commit 9b464b4e81
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735

View File

@@ -466,6 +466,7 @@ class SentencePieceVocab: # LlaMa
) )
# Token pieces that were added to the base vocabulary. # Token pieces that were added to the base vocabulary.
self.added_tokens_dict = added_tokens
self.added_tokens_list = [new_tokens[id] for id in actual_new_ids] self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
self.vocab_size_base = vocab_size self.vocab_size_base = vocab_size
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)