llama.cpp/convert-lora-to-ggml.py

#!/usr/bin/env python3
from __future__ import annotations

import json
import os
import struct
import sys
from typing import Any, BinaryIO, Sequence

import numpy as np
import torch

from pathlib import Path
if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf


# Maps a numpy dtype name to the integer type code that write_tensor_header
# stores in each tensor header. Only the dtypes listed here can be written;
# any other dtype name raises KeyError on lookup.
NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}


def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
    """Write the file header of a ggml LoRA adapter file.

    The header is the 4-byte magic followed by three C ints packed with the
    native ``struct`` layout: the file version (1), ``params["r"]`` and
    ``params["lora_alpha"]``.

    Args:
        fout: Binary stream the header is written to.
        params: Adapter configuration; must contain ``"r"`` and
            ``"lora_alpha"``.

    Raises:
        ValueError: If ``lora_alpha`` cannot be converted to an int without
            losing information (e.g. ``16.5``).
    """
    # https://opendelta.readthedocs.io/en/latest/modules/deltas.html says that `lora_alpha` is an int
    # but some models ship a float value instead
    # let's convert to int, but fail if lossless conversion is not possible
    lora_alpha = params["lora_alpha"]
    lora_alpha_int = int(lora_alpha)
    # Explicit raise instead of `assert`: asserts are stripped under `python -O`,
    # which would silently truncate a fractional alpha. Validating before the
    # first write also avoids leaving a partial header behind on failure.
    if lora_alpha_int != lora_alpha:
        raise ValueError("cannot convert float to int losslessly")

    fout.write(b"ggla"[::-1])  # magic (ggml lora)
    fout.write(struct.pack("i", 1))  # file version
    fout.write(struct.pack("i", params["r"]))
    fout.write(struct.pack("i", lora_alpha_int))


def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_type: np.dtype[Any]) -> None:
    """Write the header that precedes one tensor's data.

    Layout, all values packed as native C ints: number of dimensions, byte
    length of the UTF-8 encoded name, the type code looked up in
    ``NUMPY_TYPE_TO_FTYPE``; then the dimensions in reversed order; then the
    encoded name. Afterwards the stream position is moved forward to the next
    multiple of 32.

    Args:
        fout: Seekable binary stream to write to.
        name: Tensor name stored in the header.
        shape: Tensor dimensions (written last-to-first).
        data_type: numpy dtype of the tensor; its ``name`` must be a key of
            ``NUMPY_TYPE_TO_FTYPE``, otherwise ``KeyError`` is raised.
    """
    encoded_name = name.encode("utf-8")
    n_dims = len(shape)
    type_code = NUMPY_TYPE_TO_FTYPE[data_type.name]

    fixed_part = struct.pack("iii", n_dims, len(encoded_name), type_code)
    fout.write(fixed_part)

    dims_part = struct.pack("i" * n_dims, *reversed(shape))
    fout.write(dims_part)

    fout.write(encoded_name)

    # Round the current offset up to a 32-byte boundary.
    aligned_offset = (fout.tell() + 31) & -32
    fout.seek(aligned_offset)


if __name__ == '__main__':
    # Script entry point: converts a HuggingFace PEFT LoRA adapter directory
    # (adapter_config.json + adapter_model.bin) into ggml-adapter-model.bin,
    # written into the same directory.
    if len(sys.argv) < 2:
        print(f"Usage: python {sys.argv[0]} <path> [arch]")
        print(
            "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
        )
        print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
        sys.exit(1)

    input_json = os.path.join(sys.argv[1], "adapter_config.json")
    input_model = os.path.join(sys.argv[1], "adapter_model.bin")
    output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")

    # NOTE(security): torch.load unpickles the checkpoint, which can execute
    # arbitrary code. Only convert adapters obtained from a trusted source.
    model = torch.load(input_model, map_location="cpu")

    # The second positional argument selects the architecture. Use `> 2` so the
    # argument is still honored (not silently replaced by "llama") when extra
    # trailing arguments are present.
    arch_name = sys.argv[2] if len(sys.argv) > 2 else "llama"

    # Reverse lookup: first architecture key whose display name matches.
    arch = next(
        (key for key, value in gguf.MODEL_ARCH_NAMES.items() if value == arch_name),
        None,
    )
    if arch is None:
        print(f"Error: unsupported architecture {arch_name}")
        sys.exit(1)

    name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone

    with open(input_json, "r", encoding="utf-8") as f:
        params = json.load(f)

    # Optional config keys are read with .get() so that a config file that
    # omits them is treated like the default instead of dying with a KeyError.
    peft_type = params.get("peft_type")
    if peft_type != "LORA":
        print(f"Error: unsupported adapter type {peft_type}, expected LORA")
        sys.exit(1)

    if params.get("fan_in_fan_out") is True:
        print("Error: param fan_in_fan_out is not supported")
        sys.exit(1)

    bias = params.get("bias")
    if bias is not None and bias != "none":
        print("Error: param bias is not supported")
        sys.exit(1)

    # TODO: these seem to be layers that have been trained but without lora.
    # doesn't seem widely used but eventually should be supported
    if params.get("modules_to_save"):
        print("Error: param modules_to_save is not supported")
        sys.exit(1)

    # Recognized input-name suffix -> suffix appended to the mapped output name.
    lora_suffix_map = {
        ".lora_A.weight": ".weight.loraA",
        ".lora_B.weight": ".weight.loraB",
    }
    prefix = "base_model.model."

    # "wb" already truncates an existing file, so no explicit truncate() is needed.
    with open(output_path, "wb") as fout:
        write_file_header(fout, params)
        for orig_k, v in model.items():
            k = orig_k
            if k.endswith(".default.weight"):
                k = k.replace(".default.weight", ".weight")
            if k in ("llama_proj.weight", "llama_proj.bias"):
                continue

            if k.endswith("lora_A.weight"):
                # lora_A keeps float16/float32 as-is (anything else is
                # converted to float32) and is written transposed.
                if v.dtype not in (torch.float16, torch.float32):
                    v = v.float()
                v = v.T
            else:
                # Every other tensor is converted to float32.
                v = v.float()

            t = v.detach().numpy()

            if k.startswith(prefix):
                k = k[len(prefix):]

            # Strip the recognized suffix using its own length rather than
            # assuming all suffixes are equally long.
            suffix = next((s for s in lora_suffix_map if k.endswith(s)), None)
            if suffix is None:
                print(f"Error: unrecognized tensor name {orig_k}")
                sys.exit(1)
            k = k[: -len(suffix)]

            tname = name_map.get_name(k)
            if tname is None:
                print(f"Error: could not map tensor name {orig_k}")
                print(" Note: the arch parameter must be specified if the model is not llama")
                sys.exit(1)

            tname += lora_suffix_map[suffix]

            print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
            write_tensor_header(fout, tname, t.shape, t.dtype)
            t.tofile(fout)

    print(f"Converted {input_json} and {input_model} to {output_path}")
chmod : make scripts executable (#2675) 2023-08-23 14:29:09 +00:00			`#!/usr/bin/env python3`
convert : fix python 3.8 support, modernize type annotations (#2916) * convert : fix python 3.8 support * convert : sort imports * convert : fix required parameters in convert-llama-ggmlv3-to-gguf * convert : fix mypy errors in convert-llama-ggmlv3-to-gguf * convert : use PEP 585 generics and PEP 604 unions Now that we have `from __future__ import annotations`, we can use this modern syntax in Python 3.7 instead of restricting support to Python 3.9 or 3.10 respectively. * gguf.py : a tuple is already a tuple * add mypy.ini * convert : add necessary `type: ignore` comments * gguf-py: bump version 2023-08-31 05:02:23 +00:00			`from __future__ import annotations`

Add LoRA support (#820) 2023-04-17 15:28:55 +00:00			`import json`
			`import os`
			`import struct`
			`import sys`
convert : fix python 3.8 support, modernize type annotations (#2916) * convert : fix python 3.8 support * convert : sort imports * convert : fix required parameters in convert-llama-ggmlv3-to-gguf * convert : fix mypy errors in convert-llama-ggmlv3-to-gguf * convert : use PEP 585 generics and PEP 604 unions Now that we have `from __future__ import annotations`, we can use this modern syntax in Python 3.7 instead of restricting support to Python 3.9 or 3.10 respectively. * gguf.py : a tuple is already a tuple * add mypy.ini * convert : add necessary `type: ignore` comments * gguf-py: bump version 2023-08-31 05:02:23 +00:00			`from typing import Any, BinaryIO, Sequence`
Add LoRA support (#820) 2023-04-17 15:28:55 +00:00
fix convert-lora-to-ggml.py (#2738) 2023-08-23 14:46:54 +00:00			`import numpy as np`
Add LoRA support (#820) 2023-04-17 15:28:55 +00:00			`import torch`

lora : add support for non-llama models (#3333) * lora : add support for non-llama models ggml-ci * avoid leaking ggml_context on failure cleanup ggml-ci * lora : allow 1d tensors * lora : include embd and output layers in size calculation * fix style 2023-12-16 17:58:46 +00:00			`from pathlib import Path`
			`if 'NO_LOCAL_GGUF' not in os.environ:`
			`sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))`
			`import gguf`
Add LoRA support (#820) 2023-04-17 15:28:55 +00:00

lora : add support for non-llama models (#3333) * lora : add support for non-llama models ggml-ci * avoid leaking ggml_context on failure cleanup ggml-ci * lora : allow 1d tensors * lora : include embd and output layers in size calculation * fix style 2023-12-16 17:58:46 +00:00			`NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}`
Add LoRA support (#820) 2023-04-17 15:28:55 +00:00

convert : fix python 3.8 support, modernize type annotations (#2916) * convert : fix python 3.8 support * convert : sort imports * convert : fix required parameters in convert-llama-ggmlv3-to-gguf * convert : fix mypy errors in convert-llama-ggmlv3-to-gguf * convert : use PEP 585 generics and PEP 604 unions Now that we have `from __future__ import annotations`, we can use this modern syntax in Python 3.7 instead of restricting support to Python 3.9 or 3.10 respectively. * gguf.py : a tuple is already a tuple * add mypy.ini * convert : add necessary `type: ignore` comments * gguf-py: bump version 2023-08-31 05:02:23 +00:00			`def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:`
Add LoRA support (#820) 2023-04-17 15:28:55 +00:00			`fout.write(b"ggla"[::-1]) # magic (ggml lora)`
			`fout.write(struct.pack("i", 1)) # file version`
py : cast lora_alpha to int in convert-lora-to-ggml (#1170) Co-authored-by: Pavol Rusnak <pavol@rusnak.io> 2023-04-25 21:33:08 +00:00			`fout.write(struct.pack("i", params["r"]))`
			# https://opendelta.readthedocs.io/en/latest/modules/deltas.html says that `lora_alpha` is an int
			`# but some models ship a float value instead`
			`# let's convert to int, but fail if lossless conversion is not possible`
fix convert-lora-to-ggml.py (#2738) 2023-08-23 14:46:54 +00:00			`assert (`
			`int(params["lora_alpha"]) == params["lora_alpha"]`
			`), "cannot convert float to int losslessly"`
py : cast lora_alpha to int in convert-lora-to-ggml (#1170) Co-authored-by: Pavol Rusnak <pavol@rusnak.io> 2023-04-25 21:33:08 +00:00			`fout.write(struct.pack("i", int(params["lora_alpha"])))`
Add LoRA support (#820) 2023-04-17 15:28:55 +00:00

lora : add support for non-llama models (#3333) * lora : add support for non-llama models ggml-ci * avoid leaking ggml_context on failure cleanup ggml-ci * lora : allow 1d tensors * lora : include embd and output layers in size calculation * fix style 2023-12-16 17:58:46 +00:00			`def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_type: np.dtype[Any]) -> None:`
Add LoRA support (#820) 2023-04-17 15:28:55 +00:00			`sname = name.encode("utf-8")`
			`fout.write(`
			`struct.pack(`
			`"iii",`
			`len(shape),`
			`len(sname),`
fix convert-lora-to-ggml.py (#2738) 2023-08-23 14:46:54 +00:00			`NUMPY_TYPE_TO_FTYPE[data_type.name],`
Add LoRA support (#820) 2023-04-17 15:28:55 +00:00			`)`
			`)`
			`fout.write(struct.pack("i" * len(shape), *shape[::-1]))`
			`fout.write(sname)`
			`fout.seek((fout.tell() + 31) & -32)`


python : add check-requirements.sh and GitHub workflow (#4585) * python: add check-requirements.sh and GitHub workflow This script and workflow forces package versions to remain compatible across all convert.py scripts, while allowing secondary convert scripts to import dependencies not wanted in convert.py. Move requirements into ./requirements * Fail on "==" being used for package requirements (but can be suppressed) * Enforce "compatible release" syntax instead of == * Update workflow * Add upper version bound for transformers and protobuf * improve check-requirements.sh * small syntax change * don't remove venvs if nocleanup is passed * See if this fixes docker workflow * Move check-requirements.sh into ./scripts/ --------- Co-authored-by: Jared Van Bortel <jared@nomic.ai> 2023-12-29 14:50:29 +00:00			`if __name__ == '__main__':`
			`if len(sys.argv) < 2:`
			`print(f"Usage: python {sys.argv[0]} <path> [arch]")`
			`print(`
			`"Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"`
			`)`
			`print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")`
			`sys.exit(1)`

			`input_json = os.path.join(sys.argv[1], "adapter_config.json")`
			`input_model = os.path.join(sys.argv[1], "adapter_model.bin")`
			`output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")`

			`model = torch.load(input_model, map_location="cpu")`
			`arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"`

			`if arch_name not in gguf.MODEL_ARCH_NAMES.values():`
			`print(f"Error: unsupported architecture {arch_name}")`
			`sys.exit(1)`

			`arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]`
			`name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone`

			`with open(input_json, "r") as f:`
			`params = json.load(f)`

			`if params["peft_type"] != "LORA":`
			`print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")`
			`sys.exit(1)`

			`if params["fan_in_fan_out"] is True:`
			`print("Error: param fan_in_fan_out is not supported")`
			`sys.exit(1)`

			`if params["bias"] is not None and params["bias"] != "none":`
			`print("Error: param bias is not supported")`
			`sys.exit(1)`

			`# TODO: these seem to be layers that have been trained but without lora.`
			`# doesn't seem widely used but eventually should be supported`
			`if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0:`
			`print("Error: param modules_to_save is not supported")`
			`sys.exit(1)`

			`with open(output_path, "wb") as fout:`
			`fout.truncate()`

			`write_file_header(fout, params)`
			`for k, v in model.items():`
			`orig_k = k`
			`if k.endswith(".default.weight"):`
			`k = k.replace(".default.weight", ".weight")`
			`if k in ["llama_proj.weight", "llama_proj.bias"]:`
			`continue`
			`if k.endswith("lora_A.weight"):`
			`if v.dtype != torch.float16 and v.dtype != torch.float32:`
			`v = v.float()`
			`v = v.T`
			`else:`
Add LoRA support (#820) 2023-04-17 15:28:55 +00:00			`v = v.float()`
python : add check-requirements.sh and GitHub workflow (#4585) * python: add check-requirements.sh and GitHub workflow This script and workflow forces package versions to remain compatible across all convert.py scripts, while allowing secondary convert scripts to import dependencies not wanted in convert.py. Move requirements into ./requirements * Fail on "==" being used for package requirements (but can be suppressed) * Enforce "compatible release" syntax instead of == * Update workflow * Add upper version bound for transformers and protobuf * improve check-requirements.sh * small syntax change * don't remove venvs if nocleanup is passed * See if this fixes docker workflow * Move check-requirements.sh into ./scripts/ --------- Co-authored-by: Jared Van Bortel <jared@nomic.ai> 2023-12-29 14:50:29 +00:00
			`t = v.detach().numpy()`

			`prefix = "base_model.model."`
			`if k.startswith(prefix):`
			`k = k[len(prefix) :]`

			`lora_suffixes = (".lora_A.weight", ".lora_B.weight")`
			`if k.endswith(lora_suffixes):`
			`suffix = k[-len(lora_suffixes[0]):]`
			`k = k[: -len(lora_suffixes[0])]`
			`else:`
			`print(f"Error: unrecognized tensor name {orig_k}")`
			`sys.exit(1)`

			`tname = name_map.get_name(k)`
			`if tname is None:`
			`print(f"Error: could not map tensor name {orig_k}")`
			`print(" Note: the arch parameter must be specified if the model is not llama")`
			`sys.exit(1)`

			`if suffix == ".lora_A.weight":`
			`tname += ".weight.loraA"`
			`elif suffix == ".lora_B.weight":`
			`tname += ".weight.loraB"`
			`else:`
			`assert False`

			`print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")`
			`write_tensor_header(fout, tname, t.shape, t.dtype)`
			`t.tofile(fout)`

			`print(f"Converted {input_json} and {input_model} to {output_path}")`