diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 5fba01714..9986ce9de 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -73,6 +73,8 @@ class Keys: UNK_ID = "tokenizer.ggml.unknown_token_id" SEP_ID = "tokenizer.ggml.seperator_token_id" PAD_ID = "tokenizer.ggml.padding_token_id" + CLS_ID = "tokenizer.ggml.cls_token_id" + MASK_ID = "tokenizer.ggml.mask_token_id" ADD_BOS = "tokenizer.ggml.add_bos_token" ADD_EOS = "tokenizer.ggml.add_eos_token" ADD_PREFIX = "tokenizer.ggml.add_space_prefix" @@ -685,5 +687,7 @@ KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID +KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID +KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index d87bd8e88..26724bf94 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -414,6 +414,12 @@ class GGUFWriter: def add_pad_token_id(self, id: int) -> None: self.add_uint32(Keys.Tokenizer.PAD_ID, id) + def add_cls_token_id(self, id: int) -> None: + self.add_uint32(Keys.Tokenizer.CLS_ID, id) + + def add_mask_token_id(self, id: int) -> None: + self.add_uint32(Keys.Tokenizer.MASK_ID, id) + def add_add_bos_token(self, value: bool) -> None: self.add_bool(Keys.Tokenizer.ADD_BOS, value) diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py index cd1942975..a23136b18 100644 --- a/gguf-py/gguf/vocab.py +++ b/gguf-py/gguf/vocab.py @@ -29,7 +29,7 @@ class SpecialVocab: if special_token_types is not None: self.special_token_types = special_token_types else: - self.special_token_types = ('bos', 'eos', 'unk', 'sep', 'pad') + self.special_token_types = ('bos', 'eos', 'unk', 'sep', 'pad', 'cls', 'mask') self._load(Path(path)) def __repr__(self) -> str: @@ -152,10 +152,6 @@ class SpecialVocab: add_entry = tokenizer_config.get(f'add_{typ}_token') if isinstance(add_entry, bool): self.add_special_token[typ] = add_entry - if not added_tokens: - # We will need this to get the content for the token, so if it's empty - # may as well just give up. - continue entry = tokenizer_config.get(f'{typ}_token') if isinstance(entry, str): tc_content = entry