mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-26 03:14:35 +00:00
py : convert-hf-to-gguf-update improvements (#7340)
* convert-hf-to-gguf-update: automate updating
* convert-hf-to-gguf-update: improve download
* share requests session for performance
* create directories only when needed, don't skip downloads when empty directory encountered
* be more graceful about errors
This commit is contained in:
parent
27b040691c
commit
d273c1402b
@ -20,11 +20,13 @@
|
|||||||
# - Update llama.cpp with the new pre-tokenizer if necessary
|
# - Update llama.cpp with the new pre-tokenizer if necessary
|
||||||
#
|
#
|
||||||
# TODO: generate tokenizer tests for llama.cpp
|
# TODO: generate tokenizer tests for llama.cpp
|
||||||
# TODO: automate the update of convert-hf-to-gguf.py
|
|
||||||
#
|
#
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
import pathlib
|
||||||
|
import re
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
import sys
|
import sys
|
||||||
import json
|
import json
|
||||||
@ -35,6 +37,7 @@ from transformers import AutoTokenizer
|
|||||||
|
|
||||||
logging.basicConfig(level=logging.DEBUG)
|
logging.basicConfig(level=logging.DEBUG)
|
||||||
logger = logging.getLogger("convert-hf-to-gguf-update")
|
logger = logging.getLogger("convert-hf-to-gguf-update")
|
||||||
|
sess = requests.Session()
|
||||||
|
|
||||||
|
|
||||||
class TOKENIZER_TYPE(IntEnum):
|
class TOKENIZER_TYPE(IntEnum):
|
||||||
@ -79,63 +82,44 @@ models = [
|
|||||||
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
|
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
|
||||||
]
|
]
|
||||||
|
|
||||||
# make directory "models/tokenizers" if it doesn't exist
|
|
||||||
if not os.path.exists("models/tokenizers"):
|
|
||||||
os.makedirs("models/tokenizers")
|
|
||||||
|
|
||||||
|
|
||||||
def download_file_with_auth(url, token, save_path):
|
def download_file_with_auth(url, token, save_path):
|
||||||
headers = {"Authorization": f"Bearer {token}"}
|
headers = {"Authorization": f"Bearer {token}"}
|
||||||
response = requests.get(url, headers=headers)
|
response = sess.get(url, headers=headers)
|
||||||
if response.status_code == 200:
|
response.raise_for_status()
|
||||||
|
os.makedirs(os.path.dirname(save_path), exist_ok=True)
|
||||||
with open(save_path, 'wb') as f:
|
with open(save_path, 'wb') as f:
|
||||||
f.write(response.content)
|
f.write(response.content)
|
||||||
logger.info(f"File {save_path} downloaded successfully")
|
logger.info(f"File {save_path} downloaded successfully")
|
||||||
else:
|
|
||||||
logger.info(f"Failed to download file. Status code: {response.status_code}")
|
|
||||||
|
|
||||||
|
|
||||||
# download the tokenizer models
|
def download_model(model):
|
||||||
for model in models:
|
|
||||||
name = model["name"]
|
name = model["name"]
|
||||||
repo = model["repo"]
|
repo = model["repo"]
|
||||||
tokt = model["tokt"]
|
tokt = model["tokt"]
|
||||||
|
|
||||||
if not os.path.exists(f"models/tokenizers/{name}"):
|
os.makedirs(f"models/tokenizers/{name}", exist_ok=True)
|
||||||
os.makedirs(f"models/tokenizers/{name}")
|
|
||||||
else:
|
|
||||||
logger.info(f"Directory models/tokenizers/{name} already exists - skipping")
|
|
||||||
continue
|
|
||||||
|
|
||||||
logger.info(f"Downloading {name} to models/tokenizers/{name}")
|
|
||||||
|
|
||||||
url = f"{repo}/raw/main/config.json"
|
|
||||||
save_path = f"models/tokenizers/{name}/config.json"
|
|
||||||
download_file_with_auth(url, token, save_path)
|
|
||||||
|
|
||||||
url = f"{repo}/raw/main/tokenizer.json"
|
|
||||||
save_path = f"models/tokenizers/{name}/tokenizer.json"
|
|
||||||
download_file_with_auth(url, token, save_path)
|
|
||||||
|
|
||||||
# if downloaded file is less than 1KB, we likely need to download an LFS instead
|
|
||||||
if os.path.getsize(save_path) < 1024:
|
|
||||||
# remove the file
|
|
||||||
os.remove(save_path)
|
|
||||||
url = f"{repo}/resolve/main/tokenizer.json"
|
|
||||||
save_path = f"models/tokenizers/{name}/tokenizer.json"
|
|
||||||
download_file_with_auth(url, token, save_path)
|
|
||||||
|
|
||||||
|
files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
|
||||||
if tokt == TOKENIZER_TYPE.SPM:
|
if tokt == TOKENIZER_TYPE.SPM:
|
||||||
url = f"{repo}/resolve/main/tokenizer.model"
|
files.append("tokenizer.model")
|
||||||
save_path = f"models/tokenizers/{name}/tokenizer.model"
|
|
||||||
download_file_with_auth(url, token, save_path)
|
for file in files:
|
||||||
|
save_path = f"models/tokenizers/{name}/{file}"
|
||||||
|
if os.path.isfile(save_path):
|
||||||
|
logger.info(f"{name}: File {save_path} already exists - skipping")
|
||||||
|
continue
|
||||||
|
download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
|
||||||
|
|
||||||
|
|
||||||
|
for model in models:
|
||||||
|
try:
|
||||||
|
download_model(model)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to download model {model['name']}. Error: {e}")
|
||||||
|
|
||||||
url = f"{repo}/raw/main/tokenizer_config.json"
|
|
||||||
save_path = f"models/tokenizers/{name}/tokenizer_config.json"
|
|
||||||
download_file_with_auth(url, token, save_path)
|
|
||||||
|
|
||||||
# generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
|
# generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
|
||||||
# TODO: auto-update convert-hf-to-gguf.py with the generated function
|
|
||||||
|
|
||||||
src_ifs = ""
|
src_ifs = ""
|
||||||
for model in models:
|
for model in models:
|
||||||
@ -224,11 +208,18 @@ src_func = f"""
|
|||||||
return res
|
return res
|
||||||
"""
|
"""
|
||||||
|
|
||||||
print(src_func) # noqa: NP100
|
convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
|
||||||
|
convert_py = convert_py_pth.read_text()
|
||||||
|
convert_py = re.sub(
|
||||||
|
r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
|
||||||
|
lambda m: m.group(1) + src_func + m.group(3),
|
||||||
|
convert_py,
|
||||||
|
flags=re.DOTALL | re.MULTILINE,
|
||||||
|
)
|
||||||
|
|
||||||
logger.info("\n")
|
convert_py_pth.write_text(convert_py)
|
||||||
logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
|
|
||||||
logger.info("\n")
|
logger.info("+++ convert-hf-to-gguf.py was updated")
|
||||||
|
|
||||||
# generate tests for each tokenizer model
|
# generate tests for each tokenizer model
|
||||||
|
|
||||||
|
@ -402,6 +402,7 @@ class Model:
|
|||||||
# NOTE: this function is generated by convert-hf-to-gguf-update.py
|
# NOTE: this function is generated by convert-hf-to-gguf-update.py
|
||||||
# do not modify it manually!
|
# do not modify it manually!
|
||||||
# ref: https://github.com/ggerganov/llama.cpp/pull/6920
|
# ref: https://github.com/ggerganov/llama.cpp/pull/6920
|
||||||
|
# Marker: Start get_vocab_base_pre
|
||||||
def get_vocab_base_pre(self, tokenizer) -> str:
|
def get_vocab_base_pre(self, tokenizer) -> str:
|
||||||
# encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
|
# encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
|
||||||
# is specific for the BPE pre-tokenizer used by the model
|
# is specific for the BPE pre-tokenizer used by the model
|
||||||
@ -489,6 +490,7 @@ class Model:
|
|||||||
logger.debug(f"chkhsh: {chkhsh}")
|
logger.debug(f"chkhsh: {chkhsh}")
|
||||||
|
|
||||||
return res
|
return res
|
||||||
|
# Marker: End get_vocab_base_pre
|
||||||
|
|
||||||
def _set_vocab_gpt2(self) -> None:
|
def _set_vocab_gpt2(self) -> None:
|
||||||
tokens, toktypes, tokpre = self.get_vocab_base()
|
tokens, toktypes, tokpre = self.get_vocab_base()
|
||||||
|
Loading…
Reference in New Issue
Block a user