Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2024-12-24 10:24:35 +00:00)
Use tokenizer.vocab_size() instead of hardcoding 32000 in convert-pth-to-ggml.py (#142)
There are ways that special tokens or other new tokens could be added to the tokenizer; therefore it's probably best not to assume the vocabulary is only 32000 tokens.
This commit is contained in:
parent 113e685d18
commit 956dfda8ad
@@ -99,7 +99,7 @@ for p in range(n_parts):
|
|||||||
fout.write(struct.pack("i", ftype))
|
fout.write(struct.pack("i", ftype))
|
||||||
|
|
||||||
# Is this correct??
|
# Is this correct??
|
||||||
for i in range(32000):
|
for i in range(tokenizer.vocab_size()):
|
||||||
if tokenizer.is_unknown(i):
|
if tokenizer.is_unknown(i):
|
||||||
# "<unk>" token (translated as ??)
|
# "<unk>" token (translated as ??)
|
||||||
text = " \u2047 ".encode("utf-8")
|
text = " \u2047 ".encode("utf-8")
|
||||||
|
Loading…
Reference in New Issue
Block a user