diff --git a/CMakeLists.txt b/CMakeLists.txt index aa65b0d6c..362ab3673 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1281,17 +1281,6 @@ install( WORLD_READ WORLD_EXECUTE DESTINATION ${CMAKE_INSTALL_BINDIR}) -install( - FILES convert-lora-to-ggml.py - PERMISSIONS - OWNER_READ - OWNER_WRITE - OWNER_EXECUTE - GROUP_READ - GROUP_EXECUTE - WORLD_READ - WORLD_EXECUTE - DESTINATION ${CMAKE_INSTALL_BINDIR}) if (LLAMA_METAL) install( FILES ggml-metal.metal diff --git a/ci/run.sh b/ci/run.sh index e67c1a5ff..d5972480b 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -365,47 +365,6 @@ function gg_run_open_llama_3b_v2 { cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log - # lora - function compare_ppl { - qnt="$1" - ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1) - ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1) - - if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then - printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2" - return 20 - fi - - printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2" - return 0 - } - - path_lora="../models-mnt/open-llama/3B-v2/lora" - path_shakespeare="../models-mnt/shakespeare" - - shakespeare="${path_shakespeare}/shakespeare.txt" - lora_shakespeare="${path_lora}/ggml-adapter-model.bin" - - gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json - gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin - gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt - - python3 ../convert-lora-to-ggml.py ${path_lora} - - # f16 - (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log - (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log - compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log - - # q8_0 - (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log - (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log - compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log - - # q8_0 + f16 lora-base - (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log - compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log - set +e } @@ -416,7 +375,6 @@ function gg_sum_open_llama_3b_v2 { gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)" gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)" - gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)" gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)" gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)" gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)" @@ -429,11 +387,6 @@ function gg_sum_open_llama_3b_v2 { gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)" gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)" gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)" - gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)" - gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)" - gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)" - gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)" - gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)" } # open_llama_7b_v2 @@ -549,48 +502,6 @@ function gg_run_open_llama_7b_v2 { cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log - # lora - function compare_ppl { - qnt="$1" - ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1) - ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1) - - if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then - printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2" - return 20 - fi - - printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2" - return 0 - } - - path_lora="../models-mnt/open-llama/7B-v2/lora" - path_shakespeare="../models-mnt/shakespeare" - - shakespeare="${path_shakespeare}/shakespeare.txt" - lora_shakespeare="${path_lora}/ggml-adapter-model.bin" - - gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_config.json - gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_model.bin - gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/shakespeare.txt - - python3 ../convert-lora-to-ggml.py ${path_lora} - - # f16 - (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log - (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log - compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log - - # currently not supported by the CUDA backend - # q8_0 - #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log - #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log - #compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log - - # q8_0 + f16 lora-base - #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log - #compare_ppl "q8_0 / f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log - set +e } @@ -601,7 +512,6 @@ function gg_sum_open_llama_7b_v2 { gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)" gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)" - gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)" gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)" gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)" gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)" @@ -614,11 +524,6 @@ function gg_sum_open_llama_7b_v2 { gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)" gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)" gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)" - gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)" - gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)" - #gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)" - #gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)" - #gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)" } # bge-small diff --git a/convert-lora-to-ggml.py b/convert-lora-to-ggml.py deleted file mode 100755 index f09fa85fe..000000000 --- a/convert-lora-to-ggml.py +++ /dev/null @@ -1,150 +0,0 @@ -#!/usr/bin/env python3 -from __future__ import annotations - -import logging -import json -import os -import struct -import sys -from pathlib import Path -from typing import Any, BinaryIO, Sequence - -import numpy as np -import torch - -if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) -import gguf - -logging.basicConfig(level=logging.DEBUG) -logger = logging.getLogger("lora-to-gguf") - -NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1} - - -def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None: - fout.write(b"ggla"[::-1]) # magic (ggml lora) - fout.write(struct.pack("i", 1)) # file version - fout.write(struct.pack("i", params["r"])) - # https://opendelta.readthedocs.io/en/latest/modules/deltas.html says that `lora_alpha` is an int - # but some models ship a float value instead - # let's convert to int, but fail if lossless conversion is not possible - assert ( - int(params["lora_alpha"]) == params["lora_alpha"] - ), "cannot convert float to int losslessly" - fout.write(struct.pack("i", int(params["lora_alpha"]))) - - -def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_type: np.dtype[Any]) -> None: - sname = name.encode("utf-8") - fout.write( - struct.pack( - "iii", - len(shape), - len(sname), - NUMPY_TYPE_TO_FTYPE[data_type.name], - ) - ) - fout.write(struct.pack("i" * len(shape), *shape[::-1])) - fout.write(sname) - fout.seek((fout.tell() + 31) & -32) - - -if __name__ == '__main__': - if len(sys.argv) < 2: - logger.info(f"Usage: python {sys.argv[0]} [arch]") - logger.info("Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'") - logger.info(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)") - sys.exit(1) - - input_json = os.path.join(sys.argv[1], "adapter_config.json") - input_model = os.path.join(sys.argv[1], "adapter_model.bin") - output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin") - - if os.path.exists(input_model): - model = torch.load(input_model, map_location="cpu") - else: - input_model = os.path.join(sys.argv[1], "adapter_model.safetensors") - # lazy import load_file only if lora is in safetensors format. - from safetensors.torch import load_file - model = load_file(input_model, device="cpu") - - arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama" - - if arch_name not in gguf.MODEL_ARCH_NAMES.values(): - logger.error(f"Error: unsupported architecture {arch_name}") - sys.exit(1) - - arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)] - name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone - - with open(input_json, "r") as f: - params = json.load(f) - - if params["peft_type"] != "LORA": - logger.error(f"Error: unsupported adapter type {params['peft_type']}, expected LORA") - sys.exit(1) - - if params["fan_in_fan_out"] is True: - logger.error("Error: param fan_in_fan_out is not supported") - sys.exit(1) - - if params["bias"] is not None and params["bias"] != "none": - logger.error("Error: param bias is not supported") - sys.exit(1) - - # TODO: these seem to be layers that have been trained but without lora. - # doesn't seem widely used but eventually should be supported - if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0: - logger.error("Error: param modules_to_save is not supported") - sys.exit(1) - - with open(output_path, "wb") as fout: - fout.truncate() - - write_file_header(fout, params) - for k, v in model.items(): - orig_k = k - if k.endswith(".default.weight"): - k = k.replace(".default.weight", ".weight") - if k in ["llama_proj.weight", "llama_proj.bias"]: - continue - if k.endswith("lora_A.weight"): - if v.dtype != torch.float16 and v.dtype != torch.float32: - v = v.float() - v = v.T - else: - v = v.float() - - t = v.detach().numpy() - - prefix = "base_model.model." - if k.startswith(prefix): - k = k[len(prefix) :] - - lora_suffixes = (".lora_A.weight", ".lora_B.weight") - if k.endswith(lora_suffixes): - suffix = k[-len(lora_suffixes[0]):] - k = k[: -len(lora_suffixes[0])] - else: - logger.error(f"Error: unrecognized tensor name {orig_k}") - sys.exit(1) - - tname = name_map.get_name(k) - if tname is None: - logger.error(f"Error: could not map tensor name {orig_k}") - logger.error(" Note: the arch parameter must be specified if the model is not llama") - sys.exit(1) - - if suffix == ".lora_A.weight": - tname += ".weight.loraA" - elif suffix == ".lora_B.weight": - tname += ".weight.loraB" - else: - assert False - - logger.info(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB") - write_tensor_header(fout, tname, t.shape, t.dtype) - t.tofile(fout) - - logger.info(f"Converted {input_json} and {input_model} to {output_path}") diff --git a/requirements.txt b/requirements.txt index fc1e28278..e7d14e16a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,5 +9,4 @@ -r ./requirements/requirements-convert-hf-to-gguf.txt -r ./requirements/requirements-convert-hf-to-gguf-update.txt -r ./requirements/requirements-convert-llama-ggml-to-gguf.txt --r ./requirements/requirements-convert-lora-to-ggml.txt -r ./requirements/requirements-convert-persimmon-to-gguf.txt diff --git a/requirements/requirements-convert-lora-to-ggml.txt b/requirements/requirements-convert-lora-to-ggml.txt deleted file mode 100644 index 6ac402610..000000000 --- a/requirements/requirements-convert-lora-to-ggml.txt +++ /dev/null @@ -1,2 +0,0 @@ --r ./requirements-convert.txt -torch~=2.1.1