py : convert-hf-to-gguf-update improvements (#7340)

* convert-hf-to-gguf-update: automate updating
* convert-hf-to-gguf-update: improve download
* share requests session for performance
* create directories only when needed, don't skip downloads when an empty directory is encountered
* be more graceful about errors
2024-12-26 03:14:35 +00:00 · 2024-05-17 15:11:45 +03:00 · 2024-05-17 15:11:45 +03:00 · d273c1402b
commit d273c1402b
parent 27b040691c
2 changed files with 41 additions and 48 deletions
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -20,11 +20,13 @@
 # - Update llama.cpp with the new pre-tokenizer if necessary
 #
 # TODO: generate tokenizer tests for llama.cpp
 # TODO: automate the update of convert-hf-to-gguf.py
 #
 import logging
 import os
 import pathlib
 import re
 import requests
 import sys
 import json
@@ -35,6 +37,7 @@ from transformers import AutoTokenizer
 logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger("convert-hf-to-gguf-update")
 sess = requests.Session()
 class TOKENIZER_TYPE(IntEnum):
@@ -79,63 +82,44 @@ models = [
    {"name": "jina-v2-de",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
 ]
 # make directory "models/tokenizers" if it doesn't exist
 if not os.path.exists("models/tokenizers"):
    os.makedirs("models/tokenizers")
 def download_file_with_auth(url, token, save_path):
    headers = {"Authorization": f"Bearer {token}"}
-    response = requests.get(url, headers=headers)
+    response = sess.get(url, headers=headers)
-    if response.status_code == 200:
+    response.raise_for_status()
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    with open(save_path, 'wb') as f:
        f.write(response.content)
    logger.info(f"File {save_path} downloaded successfully")
    else:
        logger.info(f"Failed to download file. Status code: {response.status_code}")
-# download the tokenizer models
+def download_model(model):
 for model in models:
    name = model["name"]
    repo = model["repo"]
    tokt = model["tokt"]
-    if not os.path.exists(f"models/tokenizers/{name}"):
+    os.makedirs(f"models/tokenizers/{name}", exist_ok=True)
        os.makedirs(f"models/tokenizers/{name}")
    else:
        logger.info(f"Directory models/tokenizers/{name} already exists - skipping")
        continue
    logger.info(f"Downloading {name} to models/tokenizers/{name}")
    url = f"{repo}/raw/main/config.json"
    save_path = f"models/tokenizers/{name}/config.json"
    download_file_with_auth(url, token, save_path)
    url = f"{repo}/raw/main/tokenizer.json"
    save_path = f"models/tokenizers/{name}/tokenizer.json"
    download_file_with_auth(url, token, save_path)
    # if downloaded file is less than 1KB, we likely need to download an LFS instead
    if os.path.getsize(save_path) < 1024:
        # remove the file
        os.remove(save_path)
        url = f"{repo}/resolve/main/tokenizer.json"
        save_path = f"models/tokenizers/{name}/tokenizer.json"
        download_file_with_auth(url, token, save_path)
    files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
    if tokt == TOKENIZER_TYPE.SPM:
-        url = f"{repo}/resolve/main/tokenizer.model"
+        files.append("tokenizer.model")
-        save_path = f"models/tokenizers/{name}/tokenizer.model"
+
-        download_file_with_auth(url, token, save_path)
+    for file in files:
        save_path = f"models/tokenizers/{name}/{file}"
        if os.path.isfile(save_path):
            logger.info(f"{name}: File {save_path} already exists - skipping")
            continue
        download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
 for model in models:
    try:
        download_model(model)
    except Exception as e:
        logger.error(f"Failed to download model {model['name']}. Error: {e}")
    url = f"{repo}/raw/main/tokenizer_config.json"
    save_path = f"models/tokenizers/{name}/tokenizer_config.json"
    download_file_with_auth(url, token, save_path)
 # generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
 # TODO: auto-update convert-hf-to-gguf.py with the generated function
 src_ifs = ""
 for model in models:
@@ -224,11 +208,18 @@ src_func = f"""
        return res
 """
-print(src_func) # noqa: NP100
+convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
 convert_py = convert_py_pth.read_text()
 convert_py = re.sub(
    r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
    lambda m: m.group(1) + src_func + m.group(3),
    convert_py,
    flags=re.DOTALL | re.MULTILINE,
 )
-logger.info("\n")
+convert_py_pth.write_text(convert_py)
-logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
+
-logger.info("\n")
+logger.info("+++ convert-hf-to-gguf.py was updated")
 # generate tests for each tokenizer model
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -402,6 +402,7 @@ class Model:
    # NOTE: this function is generated by convert-hf-to-gguf-update.py
    #       do not modify it manually!
    # ref:  https://github.com/ggerganov/llama.cpp/pull/6920
    # Marker: Start get_vocab_base_pre
    def get_vocab_base_pre(self, tokenizer) -> str:
        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
        # is specific for the BPE pre-tokenizer used by the model
@@ -489,6 +490,7 @@ class Model:
        logger.debug(f"chkhsh: {chkhsh}")
        return res
        # Marker: End get_vocab_base_pre
    def _set_vocab_gpt2(self) -> None:
        tokens, toktypes, tokpre = self.get_vocab_base()