convert_hf : reduce usages of UNKNOWN for InternLM2

This makes the changes from #8321 more consistent with the other changes made here.
2025-01-12 19:50:17 +00:00 · 2024-07-10 17:33:04 -04:00 · 2024-07-10 17:33:04 -04:00 · 1caa20fc7a
commit 1caa20fc7a
parent afa6119850
1 changed file with 3 additions and 3 deletions
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -2189,7 +2189,7 @@ class InternLM2Model(Model):
                toktype = SentencePieceTokenTypes.BYTE
            # take care of ununsed raw token
            if piece.startswith('[UNUSED'):
-                toktype = SentencePieceTokenTypes.UNKNOWN
+                toktype = SentencePieceTokenTypes.UNUSED

            tokens.append(text)
            scores.append(score)
@@ -2219,7 +2219,7 @@ class InternLM2Model(Model):
                    if token == chat_eos_token:
                        chat_eos_token_id = token_id
                    token = token.encode("utf-8")
-                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
                        assert(tokens[token_id] == token)
                    tokens[token_id] = token
                    scores[token_id] = -1000.0
@@ -2238,7 +2238,7 @@ class InternLM2Model(Model):
                    if token == chat_eos_token:
                        chat_eos_token_id = token_id
                    token = token.encode("utf-8")
-                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
                        assert(tokens[token_id] == token)
                    tokens[token_id] = token
                    scores[token_id] = -1000.0