Mirror of https://github.com/ggerganov/llama.cpp.git
Synced 2024-11-11 13:30:35 +00:00
Improve handling of special tokens in GGML to GGUF converter (#2725)
* Improve UNK, BOS and EOS token handling when converting without metadata (see the sketch below the commit metadata).
* Allow importing as a module.
* Remove some obsolete code and minor cleanups.
* Set default UNK token mapping from -1 to 0 in llama.cpp.
* Try to handle overflow due to buggy Windows Python with a better error message.
This commit is contained in:
parent 46ef5b5fcf
commit 777f42ba18
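In concrete terms, the first bullet means the converter now falls back to the standard LLaMA special-token layout when the source model carries no vocab metadata. A sketch of that mapping, distilled from the diff below (not code from the commit itself):

    # Fallback special-token layout the converter now writes when no
    # vocab metadata is available (standard LLaMA SentencePiece IDs,
    # per the diff below):
    DEFAULT_SPECIAL_TOKENS = {
        0: (b'<unk>', 2),  # UNK -> token type 2 (unknown)
        1: (b'<s>',   3),  # BOS -> token type 3 (control)
        2: (b'</s>',  3),  # EOS -> token type 3 (control)
    }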
GGML to GGUF converter script:

@@ -1,10 +1,12 @@
-import sys, struct, math, argparse
+import sys, struct, math, argparse, warnings
 from pathlib import Path
 
 import numpy as np
 
 import gguf
 
+warnings.filterwarnings('error')
+
 # Note: Does not support GGML_QKK_64
 QK_K = 256
 # Items here are (block size, type size)
@@ -215,15 +217,10 @@ class GGMLToGGUF:
         if self.vocab_override is not None:
             vo = self.vocab_override
             print('* Adding vocab item(s)')
-            for (idx, vitem) in enumerate(vo.all_tokens()):
-                if len(vitem) == 3:
-                    tokens.append(vitem[0])
-                    scores.append(vitem[1])
-                    toktypes.append(vitem[2])
-                else:
-                    # Maybe try to guess the token type here?
-                    tokens.append(vitem[0])
-                    scores.append(vitem[1])
+            for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
+                tokens.append(vbytes)
+                scores.append(score)
+                toktypes.append(ttype)
             assert len(tokens) == hp.n_vocab, f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
             gguf_writer.add_token_list(tokens)
             gguf_writer.add_token_scores(scores)
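The override loop no longer special-cases 2-tuples: all_tokens() is now expected to always yield three-element entries. A hypothetical stand-in showing the expected shape (names and values illustrative only):

    # Hypothetical stand-in for the override vocab consumed above --
    # all_tokens() must now always yield (token_bytes, score, token_type):
    class ExampleVocab:
        def all_tokens(self):
            yield (b'<unk>', 0.0, 2)   # type 2 = unknown
            yield (b'<s>', 0.0, 3)     # type 3 = control
            yield (b'hello', -1.5, 1)  # type 1 = normal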
@@ -231,9 +228,21 @@ class GGMLToGGUF:
             gguf_writer.add_token_types(toktypes)
             return
         print(f'* Adding {hp.n_vocab} vocab item(s)')
+        assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab'
         for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
             tt = 1 # Normal
-            if len(vbytes) == 0:
+            # Special handling for UNK, BOS, EOS tokens.
+            if tokid <= 2:
+                if tokid == 0:
+                    vbytes = b'<unk>'
+                    tt = 2
+                elif tokid == 1:
+                    vbytes = b'<s>'
+                    tt = 3
+                else:
+                    vbytes = b'</s>'
+                    tt = 3
+            elif len(vbytes) == 0:
                 tt = 3 # Control
             elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
                 vbytes = bytes(f'<0x{vbytes[0]:02X}>', encoding = 'UTF-8')
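For readers mapping the bare numbers above to meaning, the token types follow llama.cpp's convention (constant names here are illustrative; the converter writes the raw values):

    # Token-type values used above (illustrative constant names):
    TOKEN_TYPE_NORMAL  = 1  # ordinary vocab entries
    TOKEN_TYPE_UNKNOWN = 2  # <unk>
    TOKEN_TYPE_CONTROL = 3  # <s>, </s> and other control tokens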
@@ -246,6 +255,9 @@ class GGMLToGGUF:
         gguf_writer.add_token_list(tokens)
         gguf_writer.add_token_scores(scores)
         gguf_writer.add_token_types(toktypes)
+        gguf_writer.add_unk_token_id(0)
+        gguf_writer.add_bos_token_id(1)
+        gguf_writer.add_eos_token_id(2)
 
     def add_tensors(self, gguf_writer):
         nm = self.name_map
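The three new writer calls pin the special-token IDs in the output file's metadata. A minimal usage sketch (the output file name is hypothetical, and the tokenizer.ggml.* key names are stated as an assumption about the gguf package's standard keys):

    import gguf

    writer = gguf.GGUFWriter('example.gguf', 'llama')  # hypothetical output
    writer.add_unk_token_id(0)  # assumed key: tokenizer.ggml.unknown_token_id
    writer.add_bos_token_id(1)  # assumed key: tokenizer.ggml.bos_token_id
    writer.add_eos_token_id(2)  # assumed key: tokenizer.ggml.eos_token_id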
@@ -315,7 +327,11 @@ def main():
     data = np.memmap(cfg.input, mode = 'r')
     model = GGMLV3Model()
     print('* Scanning GGML input file')
-    offset = model.load(data, 0)
+    try:
+        offset = model.load(data, 0)
+    except OverflowError:
+        print(f'!!! Caught overflow loading tensors. The most likely issue is running on Windows but not in WSL. Try running in WSL if possible.', file = sys.stderr)
+        raise
     print(f'* GGML model hyperparameters: {model.hyperparameters}')
     vocab_override = None
     params_override = None
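The likely source of the OverflowError (an assumption; the commit only hints at Windows) is fixed-width integer conversion in NumPy: on Windows the default NumPy integer is 32-bit, so offset arithmetic on models larger than 2 GiB can overflow. A runnable illustration of the exception class being caught:

    import numpy as np

    # Converting a too-large Python int to a fixed-width NumPy integer
    # raises OverflowError; on Windows this already happens for offsets
    # past the 32-bit range during tensor loading.
    try:
        np.int64(2**63)  # out of range for a signed 64-bit integer
    except OverflowError as e:
        print('overflow:', e)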
@@ -330,4 +346,5 @@ def main():
     converter.save()
     print(f'* Successful completion. Output saved to: {cfg.output}')
 
-main()
+if __name__ == '__main__':
+    main()
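With main() behind the guard, the script can be imported without triggering a conversion (the second bullet of the commit message). A hedged illustration; the script path below is assumed, since the file name is not shown on this page:

    import importlib.util

    # Load the converter script as a module; with the new guard this
    # defines the classes but does not run main():
    spec = importlib.util.spec_from_file_location(
        'ggml_to_gguf', 'convert-llama-ggmlv3-to-gguf.py')  # assumed path
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)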
llama.cpp:

@@ -703,7 +703,7 @@ struct llama_vocab {
     // default LLaMA special tokens
     id special_bos_id = 1;
     id special_eos_id = 2;
-    id special_unk_id = -1;
+    id special_unk_id = 0;
     id special_sep_id = -1;
     id special_pad_id = -1;
 