From 53635c081c49321d523567112f9fddfbba6b787b Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 29 Mar 2023 19:29:26 +0300
Subject: [PATCH] py : add GPT4All conversion script

For now: copy-paste
Too much time for me to deduplicate the python code
---
Review notes (commentary only; no token of the diff below was changed):

 * The line structure of this patch was rebuilt from a copy whose newlines
   and indentation had been collapsed. Indentation inside the Python hunks
   is inferred from syntax (4 spaces per level). The trailing whitespace on
   the removed line of the second hunk is assumed to be four spaces; verify
   against the upstream commit if an exact match matters.
 * What the new script does, as written:
   - read_header() unpacks 3 + len(HPARAMS) = 8 values packed with "i".
   - write_header() raises unless magic == 0x67676d6c, then writes magic
     0x67676d66, file version 1, and the remaining seven input fields
     unchanged.
   - read_tokens() skips tokenizer.vocab_size() entries from the input,
     each read as a 4-byte "i" length followed by that many bytes.
   - write_tokens() writes tokenizer.vocab_size() entries as
     length, bytes, "f" score, then ONE extra empty entry with score 0.0
     (see the TODO in the code).
   - convert_one_file() writes to <input>.tmp, then renames the input to
     <input>.orig and the .tmp file to the input path.
 * Follow-ups worth considering in a later change:
   - read_header()/read_tokens() do not check for short reads; a truncated
     input surfaces as struct.error from struct.unpack.
   - If convert_one_file() raises part-way, <input>.tmp is left behind.
   - `keys` in `HPARAMS = keys = [...]` and `import glob` are not used
     anywhere in this script.
   - read_tokens() consumes exactly tokenizer.vocab_size() entries while
     write_tokens() emits one more; presumably the input file stores only
     vocab_size() entries -- confirm against a real GPT4All file.

 convert-gpt4all-to-ggml.py          | 107 ++++++++++++++++++++++++++++
 convert-unversioned-ggml-to-ggml.py |   2 +-
 2 files changed, 108 insertions(+), 1 deletion(-)
 create mode 100644 convert-gpt4all-to-ggml.py

diff --git a/convert-gpt4all-to-ggml.py b/convert-gpt4all-to-ggml.py
new file mode 100644
index 000000000..f1d9d7aef
--- /dev/null
+++ b/convert-gpt4all-to-ggml.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+
+#
+# TODO: deduplicate GPT4All with convert-unversioned-ggml-to-ggml.py
+#
+
+# Original by https://github.com/eiz
+# https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818
+import argparse
+import glob
+import os
+import struct
+import sys
+from sentencepiece import SentencePieceProcessor
+
+HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Upgrade a GPT4All model to the current format')
+    parser.add_argument('gpt4all_model', help='path to gpt4all-lora-quantized.bin')
+    parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
+    return parser.parse_args()
+
+def read_header(f_in):
+    struct_fmt = "i" * (3 + len(HPARAMS))
+    struct_size = struct.calcsize(struct_fmt)
+    buf = f_in.read(struct_size)
+    return struct.unpack(struct_fmt, buf)
+
+def write_header(f_out, header):
+    (magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header
+
+    if magic != 0x67676d6c:
+        raise Exception('Invalid file magic. Must be an old style ggml file.')
+
+    values = [
+        0x67676d66, # magic: ggml in hex
+        1, # file version
+        vocab_size,
+        dim,
+        multiple_of,
+        n_heads,
+        n_layers,
+        rot,
+        ftype
+    ]
+    f_out.write(struct.pack("i" * len(values), *values))
+
+def write_tokens(fout, tokenizer):
+    for i in range(tokenizer.vocab_size()):
+        if tokenizer.is_unknown(i):
+            text = " \u2047 ".encode("utf-8")
+        elif tokenizer.is_control(i):
+            text = b""
+        elif tokenizer.is_byte(i):
+            piece = tokenizer.id_to_piece(i)
+            if len(piece) != 6:
+                print(f"Invalid token: {piece}")
+                sys.exit(1)
+            byte_value = int(piece[3:-1], 16)
+            text = struct.pack("B", byte_value)
+        else:
+            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+        fout.write(struct.pack("i", len(text)))
+        fout.write(text)
+        fout.write(struct.pack("f", tokenizer.get_score(i)))
+
+    # TODO: GPT4All - add extra token
+    text = "".encode("utf-8")
+    fout.write(struct.pack("i", len(text)))
+    fout.write(text)
+    fout.write(struct.pack("f", 0.0))
+
+def read_tokens(f_in, tokenizer):
+    for i in range(tokenizer.vocab_size()):
+        len_b = f_in.read(4)
+        (length,) = struct.unpack("i", len_b)
+        f_in.read(length)
+
+def copy_all_data(f_out, f_in):
+    while True:
+        buf = f_in.read(1024 * 1024)
+        if not buf:
+            break
+        f_out.write(buf)
+
+def convert_one_file(path_in, tokenizer):
+    path_tmp = f"{path_in}.tmp"
+    path_orig= f"{path_in}.orig"
+    print(f"converting {path_in}")
+    with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
+        write_header(f_out, read_header(f_in))
+        read_tokens(f_in, tokenizer)
+        write_tokens(f_out, tokenizer)
+        copy_all_data(f_out, f_in)
+    os.rename(path_in, path_orig)
+    os.rename(path_tmp, path_in)
+
+def main():
+    args = parse_args()
+
+    tokenizer = SentencePieceProcessor(args.tokenizer_model)
+
+    convert_one_file(args.gpt4all_model, tokenizer)
+
+if __name__ == "__main__":
+    main()
diff --git a/convert-unversioned-ggml-to-ggml.py b/convert-unversioned-ggml-to-ggml.py
index 2457e3181..33b6243bd 100644
--- a/convert-unversioned-ggml-to-ggml.py
+++ b/convert-unversioned-ggml-to-ggml.py
@@ -27,7 +27,7 @@ def write_header(f_out, header):
 
     if magic != 0x67676d6c:
         raise Exception('Invalid file magic. Must be an old style ggml file.')
-    
+
     values = [
         0x67676d66, # magic: ggml in hex
         1, # file version