llama.cpp/convert-pth-to-ggml.py

# Convert a LLaMA model checkpoint to a ggml compatible file
#
# Load the model using Torch
# Iterate over all variables and write them to a binary file.
#
# For each variable, write the following:
#   - Number of dimensions (int)
#   - Name length (int)
#   - Dimensions (int[n_dims])
#   - Name (char[name_length])
#   - Data (float[n_dims])
#
# By default, the bigger matrices are converted to 16-bit floats.
# This can be disabled by adding the "use-f32" CLI argument.
#
# At the start of the ggml file we write the model parameters
# and vocabulary.
#
import argparse
import sys
import json
import struct
import numpy as np
import torch
from sentencepiece import SentencePieceProcessor

def parse_args():

    parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
    parser.add_argument('dir_model', help='directory containing the model checkpoint')
    parser.add_argument('ftype', type=int, choices=[0, 1], default=1, help='file type (0: float32, 1: float16)')
    return parser.parse_args()

def get_n_parts(dim):

    mappings = {4096: 1, 5120: 2, 6656: 4, 8192: 8}
    n_parts = mappings.get(dim)
    if n_parts is None:
        print(f"Invalid dim: {dim}")
        sys.exit(1)

    print(f"n_parts = {n_parts}\n")
    return n_parts

def load_hparams_and_tokenizer(dir_model):

    fname_hparams = f"{dir_model}/params.json"
    fname_tokenizer = f"{dir_model}/../tokenizer.model"

    with open(fname_hparams, "r") as f:
        hparams = json.load(f)
        print(hparams)

    tokenizer = SentencePieceProcessor(fname_tokenizer)
    hparams.update({"vocab_size": tokenizer.vocab_size()})

    return hparams, tokenizer

def write_header(fout, hparams, ftype):

    keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
    values = [
        0x67676d66,  # magic: ggml in hex
        1, # file version
        *[hparams[key] for key in keys],
        hparams["dim"] // hparams["n_heads"],  # rot (obsolete)
        ftype
    ]
    fout.write(struct.pack("i" * len(values), *values))

def write_tokens(fout, tokenizer):

    for i in range(tokenizer.vocab_size()):
        if tokenizer.is_unknown(i):
            text = " \u2047 ".encode("utf-8")
        elif tokenizer.is_control(i):
            text = b""
        elif tokenizer.is_byte(i):
            piece = tokenizer.id_to_piece(i)
            if len(piece) != 6:
                print(f"Invalid token: {piece}")
                sys.exit(1)
            byte_value = int(piece[3:-1], 16)
            text = struct.pack("B", byte_value)
        else:
            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
        fout.write(struct.pack("i", len(text)))
        fout.write(text)
        fout.write(struct.pack("f", tokenizer.get_score(i)))

def process_and_write_variables(fout, model, ftype):

    for name, datao in model.items():

        if name.endswith("freqs"):
            continue

        shape = datao.shape

        print(f"Processing variable: {name} with shape: {shape} and type: {datao.dtype}")

        data = datao.numpy().squeeze()
        n_dims = len(shape)

        # default type is fp16
        ftype_cur = 1
        if ftype == 0 or n_dims == 1:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0

        # header
        sname = name.encode('utf-8')
        fout.write(struct.pack("iii", len(data.shape), len(sname), ftype_cur))
        for dim in reversed(data.shape):
            fout.write(struct.pack("i", dim))
        fout.write(sname)

        # data output to file
        data.tofile(fout)

def main():

    args = parse_args()
    dir_model = args.dir_model
    ftype = args.ftype
    ftype_str = ["f32", "f16"]

    hparams, tokenizer = load_hparams_and_tokenizer(dir_model)
    n_parts = get_n_parts(hparams["dim"])

    for p in range(n_parts):

        print(f"Processing part {p}\n")

        fname_model = f"{dir_model}/consolidated.0{p}.pth"
        fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin{'' if p == 0 else '.' + str(p)}"

        model = torch.load(fname_model, map_location="cpu")

        with open(fname_out, "wb") as fout:
            write_header(fout, hparams, ftype)
            write_tokens(fout, tokenizer)
            process_and_write_variables(fout, model, ftype)

        del model
        print(f"Done. Output file: {fname_out}, (part {p})\n")

if __name__ == "__main__":
    main()
Initial release 2023-03-10 18:40:58 +00:00			`# Convert a LLaMA model checkpoint to a ggml compatible file`
			`#`
			`# Load the model using Torch`
			`# Iterate over all variables and write them to a binary file.`
			`#`
			`# For each variable, write the following:`
			`# - Number of dimensions (int)`
			`# - Name length (int)`
			`# - Dimensions (int[n_dims])`
			`# - Name (char[name_length])`
			`# - Data (float[n_dims])`
			`#`
			`# By default, the bigger matrices are converted to 16-bit floats.`
			`# This can be disabled by adding the "use-f32" CLI argument.`
			`#`
			`# At the start of the ggml file we write the model parameters`
			`# and vocabulary.`
			`#`
Refactoring `convert-pth-to-ggml.py`: more concise and readable (#109) * Refactor get_n_parts function to simplify code and improve readability * Use f-strings instead of concatenation * Refactoring: more concise and readable * modularize --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-03-19 17:17:39 +00:00			`import argparse`
Initial release 2023-03-10 18:40:58 +00:00			`import sys`
			`import json`
			`import struct`
			`import numpy as np`
			`import torch`
			`from sentencepiece import SentencePieceProcessor`

Refactoring `convert-pth-to-ggml.py`: more concise and readable (#109) * Refactor get_n_parts function to simplify code and improve readability * Use f-strings instead of concatenation * Refactoring: more concise and readable * modularize --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-03-19 17:17:39 +00:00			`def parse_args():`
Initial release 2023-03-10 18:40:58 +00:00
Refactoring `convert-pth-to-ggml.py`: more concise and readable (#109) * Refactor get_n_parts function to simplify code and improve readability * Use f-strings instead of concatenation * Refactoring: more concise and readable * modularize --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-03-19 17:17:39 +00:00			`parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')`
			`parser.add_argument('dir_model', help='directory containing the model checkpoint')`
			`parser.add_argument('ftype', type=int, choices=[0, 1], default=1, help='file type (0: float32, 1: float16)')`
			`return parser.parse_args()`
Initial release 2023-03-10 18:40:58 +00:00
Support all LLaMA models + change Q4_0 quantization storage 2023-03-11 08:47:09 +00:00			`def get_n_parts(dim):`
Fix python stuff (#109) 2023-03-19 17:33:18 +00:00
Refactoring `convert-pth-to-ggml.py`: more concise and readable (#109) * Refactor get_n_parts function to simplify code and improve readability * Use f-strings instead of concatenation * Refactoring: more concise and readable * modularize --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-03-19 17:17:39 +00:00			`mappings = {4096: 1, 5120: 2, 6656: 4, 8192: 8}`
			`n_parts = mappings.get(dim)`
			`if n_parts is None:`
			`print(f"Invalid dim: {dim}")`
Support all LLaMA models + change Q4_0 quantization storage 2023-03-11 08:47:09 +00:00			`sys.exit(1)`

Refactoring `convert-pth-to-ggml.py`: more concise and readable (#109) * Refactor get_n_parts function to simplify code and improve readability * Use f-strings instead of concatenation * Refactoring: more concise and readable * modularize --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-03-19 17:17:39 +00:00			`print(f"n_parts = {n_parts}\n")`
			`return n_parts`
Initial release 2023-03-10 18:40:58 +00:00
Refactoring `convert-pth-to-ggml.py`: more concise and readable (#109) * Refactor get_n_parts function to simplify code and improve readability * Use f-strings instead of concatenation * Refactoring: more concise and readable * modularize --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-03-19 17:17:39 +00:00			`def load_hparams_and_tokenizer(dir_model):`
Fix python stuff (#109) 2023-03-19 17:33:18 +00:00
Refactoring `convert-pth-to-ggml.py`: more concise and readable (#109) * Refactor get_n_parts function to simplify code and improve readability * Use f-strings instead of concatenation * Refactoring: more concise and readable * modularize --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-03-19 17:17:39 +00:00			`fname_hparams = f"{dir_model}/params.json"`
			`fname_tokenizer = f"{dir_model}/../tokenizer.model"`
Initial release 2023-03-10 18:40:58 +00:00
Refactoring `convert-pth-to-ggml.py`: more concise and readable (#109) * Refactor get_n_parts function to simplify code and improve readability * Use f-strings instead of concatenation * Refactoring: more concise and readable * modularize --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-03-19 17:17:39 +00:00			`with open(fname_hparams, "r") as f:`
			`hparams = json.load(f)`
			`print(hparams)`
Initial release 2023-03-10 18:40:58 +00:00
Refactoring `convert-pth-to-ggml.py`: more concise and readable (#109) * Refactor get_n_parts function to simplify code and improve readability * Use f-strings instead of concatenation * Refactoring: more concise and readable * modularize --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-03-19 17:17:39 +00:00			`tokenizer = SentencePieceProcessor(fname_tokenizer)`
			`hparams.update({"vocab_size": tokenizer.vocab_size()})`
Support all LLaMA models + change Q4_0 quantization storage 2023-03-11 08:47:09 +00:00
Refactoring `convert-pth-to-ggml.py`: more concise and readable (#109) * Refactor get_n_parts function to simplify code and improve readability * Use f-strings instead of concatenation * Refactoring: more concise and readable * modularize --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-03-19 17:17:39 +00:00			`return hparams, tokenizer`
Initial release 2023-03-10 18:40:58 +00:00
Refactoring `convert-pth-to-ggml.py`: more concise and readable (#109) * Refactor get_n_parts function to simplify code and improve readability * Use f-strings instead of concatenation * Refactoring: more concise and readable * modularize --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-03-19 17:17:39 +00:00			`def write_header(fout, hparams, ftype):`
Fix python stuff (#109) 2023-03-19 17:33:18 +00:00
Refactoring `convert-pth-to-ggml.py`: more concise and readable (#109) * Refactor get_n_parts function to simplify code and improve readability * Use f-strings instead of concatenation * Refactoring: more concise and readable * modularize --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-03-19 17:17:39 +00:00			`keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]`
			`values = [`
sentencepiece bpe compatible tokenizer (#252) * potential out of bounds read * fix quantize * style * Update convert-pth-to-ggml.py * mild cleanup * don't need the space-prefixing here rn since main.cpp already does it * new file magic + version header field * readme notice * missing newlines Co-authored-by: slaren <2141330+slaren@users.noreply.github.com> 2023-03-20 10:17:23 +00:00			`0x67676d66, # magic: ggml in hex`
			`1, # file version`
Refactoring `convert-pth-to-ggml.py`: more concise and readable (#109) * Refactor get_n_parts function to simplify code and improve readability * Use f-strings instead of concatenation * Refactoring: more concise and readable * modularize --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-03-19 17:17:39 +00:00			`*[hparams[key] for key in keys],`
			`hparams["dim"] // hparams["n_heads"], # rot (obsolete)`
			`ftype`
			`]`
			`fout.write(struct.pack("i" * len(values), *values))`
Support all LLaMA models + change Q4_0 quantization storage 2023-03-11 08:47:09 +00:00
Refactoring `convert-pth-to-ggml.py`: more concise and readable (#109) * Refactor get_n_parts function to simplify code and improve readability * Use f-strings instead of concatenation * Refactoring: more concise and readable * modularize --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-03-19 17:17:39 +00:00			`def write_tokens(fout, tokenizer):`
Support all LLaMA models + change Q4_0 quantization storage 2023-03-11 08:47:09 +00:00
Use `tokenizer.vocab_size()` instead of hardcoding 32000 in convert-pth-to-ggml.py (#142) There are ways that special tokens or other new tokens could be added to the tokenizer; therefore it's probably best not to assume the vocabulary is only 32000 tokens. 2023-03-15 19:37:50 +00:00			`for i in range(tokenizer.vocab_size()):`
Fix UTF-8 handling (including colors) (#79) 2023-03-13 16:24:18 +00:00			`if tokenizer.is_unknown(i):`
			`text = " \u2047 ".encode("utf-8")`
			`elif tokenizer.is_control(i):`
Refactoring `convert-pth-to-ggml.py`: more concise and readable (#109) * Refactor get_n_parts function to simplify code and improve readability * Use f-strings instead of concatenation * Refactoring: more concise and readable * modularize --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-03-19 17:17:39 +00:00			`text = b""`
Fix UTF-8 handling (including colors) (#79) 2023-03-13 16:24:18 +00:00			`elif tokenizer.is_byte(i):`
			`piece = tokenizer.id_to_piece(i)`
			`if len(piece) != 6:`
Refactoring `convert-pth-to-ggml.py`: more concise and readable (#109) * Refactor get_n_parts function to simplify code and improve readability * Use f-strings instead of concatenation * Refactoring: more concise and readable * modularize --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-03-19 17:17:39 +00:00			`print(f"Invalid token: {piece}")`
Fix UTF-8 handling (including colors) (#79) 2023-03-13 16:24:18 +00:00			`sys.exit(1)`
			`byte_value = int(piece[3:-1], 16)`
Refactoring `convert-pth-to-ggml.py`: more concise and readable (#109) * Refactor get_n_parts function to simplify code and improve readability * Use f-strings instead of concatenation * Refactoring: more concise and readable * modularize --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-03-19 17:17:39 +00:00			`text = struct.pack("B", byte_value)`
Fix UTF-8 handling (including colors) (#79) 2023-03-13 16:24:18 +00:00			`else:`
			`text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")`
Refactoring `convert-pth-to-ggml.py`: more concise and readable (#109) * Refactor get_n_parts function to simplify code and improve readability * Use f-strings instead of concatenation * Refactoring: more concise and readable * modularize --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-03-19 17:17:39 +00:00			`fout.write(struct.pack("i", len(text)))`
			`fout.write(text)`
sentencepiece bpe compatible tokenizer (#252) * potential out of bounds read * fix quantize * style * Update convert-pth-to-ggml.py * mild cleanup * don't need the space-prefixing here rn since main.cpp already does it * new file magic + version header field * readme notice * missing newlines Co-authored-by: slaren <2141330+slaren@users.noreply.github.com> 2023-03-20 10:17:23 +00:00			`fout.write(struct.pack("f", tokenizer.get_score(i)))`
Support all LLaMA models + change Q4_0 quantization storage 2023-03-11 08:47:09 +00:00
Refactoring `convert-pth-to-ggml.py`: more concise and readable (#109) * Refactor get_n_parts function to simplify code and improve readability * Use f-strings instead of concatenation * Refactoring: more concise and readable * modularize --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-03-19 17:17:39 +00:00			`def process_and_write_variables(fout, model, ftype):`
Support all LLaMA models + change Q4_0 quantization storage 2023-03-11 08:47:09 +00:00
Fix python stuff (#109) 2023-03-19 17:33:18 +00:00			`for name, datao in model.items():`

Refactoring `convert-pth-to-ggml.py`: more concise and readable (#109) * Refactor get_n_parts function to simplify code and improve readability * Use f-strings instead of concatenation * Refactoring: more concise and readable * modularize --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-03-19 17:17:39 +00:00			`if name.endswith("freqs"):`
Support all LLaMA models + change Q4_0 quantization storage 2023-03-11 08:47:09 +00:00			`continue`

Fix python stuff (#109) 2023-03-19 17:33:18 +00:00			`shape = datao.shape`

			`print(f"Processing variable: {name} with shape: {shape} and type: {datao.dtype}")`

			`data = datao.numpy().squeeze()`
			`n_dims = len(shape)`
Support all LLaMA models + change Q4_0 quantization storage 2023-03-11 08:47:09 +00:00
			`# default type is fp16`
			`ftype_cur = 1`
			`if ftype == 0 or n_dims == 1:`
			`print(" Converting to float32")`
			`data = data.astype(np.float32)`
			`ftype_cur = 0`

			`# header`
			`sname = name.encode('utf-8')`
Refactoring `convert-pth-to-ggml.py`: more concise and readable (#109) * Refactor get_n_parts function to simplify code and improve readability * Use f-strings instead of concatenation * Refactoring: more concise and readable * modularize --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-03-19 17:17:39 +00:00			`fout.write(struct.pack("iii", len(data.shape), len(sname), ftype_cur))`
			`for dim in reversed(data.shape):`
			`fout.write(struct.pack("i", dim))`
			`fout.write(sname)`
Fix python stuff (#109) 2023-03-19 17:33:18 +00:00
			`# data output to file`
Support all LLaMA models + change Q4_0 quantization storage 2023-03-11 08:47:09 +00:00			`data.tofile(fout)`

Refactoring `convert-pth-to-ggml.py`: more concise and readable (#109) * Refactor get_n_parts function to simplify code and improve readability * Use f-strings instead of concatenation * Refactoring: more concise and readable * modularize --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-03-19 17:17:39 +00:00			`def main():`

			`args = parse_args()`
			`dir_model = args.dir_model`
			`ftype = args.ftype`
			`ftype_str = ["f32", "f16"]`

			`hparams, tokenizer = load_hparams_and_tokenizer(dir_model)`
			`n_parts = get_n_parts(hparams["dim"])`

			`for p in range(n_parts):`

			`print(f"Processing part {p}\n")`
Fix python stuff (#109) 2023-03-19 17:33:18 +00:00
Refactoring `convert-pth-to-ggml.py`: more concise and readable (#109) * Refactor get_n_parts function to simplify code and improve readability * Use f-strings instead of concatenation * Refactoring: more concise and readable * modularize --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-03-19 17:17:39 +00:00			`fname_model = f"{dir_model}/consolidated.0{p}.pth"`
			`fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin{'' if p == 0 else '.' + str(p)}"`

			`model = torch.load(fname_model, map_location="cpu")`

			`with open(fname_out, "wb") as fout:`
			`write_header(fout, hparams, ftype)`
			`write_tokens(fout, tokenizer)`
			`process_and_write_variables(fout, model, ftype)`
Support all LLaMA models + change Q4_0 quantization storage 2023-03-11 08:47:09 +00:00
Refactoring `convert-pth-to-ggml.py`: more concise and readable (#109) * Refactor get_n_parts function to simplify code and improve readability * Use f-strings instead of concatenation * Refactoring: more concise and readable * modularize --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-03-19 17:17:39 +00:00			`del model`
			`print(f"Done. Output file: {fname_out}, (part {p})\n")`
Support all LLaMA models + change Q4_0 quantization storage 2023-03-11 08:47:09 +00:00
Refactoring `convert-pth-to-ggml.py`: more concise and readable (#109) * Refactor get_n_parts function to simplify code and improve readability * Use f-strings instead of concatenation * Refactoring: more concise and readable * modularize --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-03-19 17:17:39 +00:00			`if __name__ == "__main__":`
			`main()`