mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-25 02:44:36 +00:00
convert : handle tokenizer merges format from transformers 4.45 (#9696)
This commit is contained in:
parent
841713e1e4
commit
e3c355ba65
@@ -122,8 +122,30 @@ class SpecialVocab:
|
|||||||
tokenizer = json.load(f)
|
tokenizer = json.load(f)
|
||||||
if self.load_merges:
|
if self.load_merges:
|
||||||
merges = tokenizer.get('model', {}).get('merges')
|
merges = tokenizer.get('model', {}).get('merges')
|
||||||
if isinstance(merges, list) and merges and isinstance(merges[0], str):
|
if isinstance(merges, list) and merges:
|
||||||
self.merges = merges
|
if isinstance(merges[0], str):
|
||||||
|
self.merges = merges
|
||||||
|
elif isinstance(merges[0], list) and len(merges[0]) == 2 and isinstance(merges[0][0], str):
|
||||||
|
# New format since transformers 4.45 to support spaces in merges
|
||||||
|
# ref: https://github.com/ggerganov/llama.cpp/issues/9692
|
||||||
|
# TODO: internally store as the new format instead of converting to old
|
||||||
|
if any(' ' in s for pair in merges for s in pair):
|
||||||
|
logger.warning(f'Spaces in merges detected, encoding as {chr(ord(" ") + 256)!r}')
|
||||||
|
self.merges = [
|
||||||
|
' '.join(
|
||||||
|
[
|
||||||
|
# ensure the spaces are properly encoded
|
||||||
|
''.join(
|
||||||
|
chr(ord(c) + 256) if c == ' ' else c
|
||||||
|
for c in part
|
||||||
|
)
|
||||||
|
for part in pair
|
||||||
|
]
|
||||||
|
)
|
||||||
|
for pair in merges
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
raise ValueError("Unknown tokenizer merges format")
|
||||||
added_tokens = tokenizer.get('added_tokens', {})
|
added_tokens = tokenizer.get('added_tokens', {})
|
||||||
else:
|
else:
|
||||||
added_tokens = {}
|
added_tokens = {}
|
||||||
|
Loading…
Reference in New Issue
Block a user