llama.cpp/quantize.py

#!/usr/bin/env python3

"""Script to execute the "quantize" script on a given set of models."""

import subprocess
import argparse
import glob
import sys
import os


def main():
    """Quantize the f16 weights of every model named on the command line.

    Picks the quantize binary name for the current platform, parses the
    command line arguments and runs the binary once for every f16 model part
    (ggml-model-f16.bin, ggml-model-f16.bin.1, ...) found in
    ``<models-path>/<model>``. With ``--remove-16`` each f16 part is deleted
    right after it has been quantized.

    Exits with status 1 when the quantize binary is missing, when a requested
    model has no f16 part on disk, or when a matched path is not a regular
    file.
    """

    if "linux" in sys.platform or "darwin" in sys.platform:
        quantize_script_binary = "quantize"

    elif "win32" in sys.platform or "cygwin" in sys.platform:
        quantize_script_binary = "quantize.exe"

    else:
        print("WARNING: Unknown platform. Assuming a UNIX-like OS.\n")
        quantize_script_binary = "quantize"

    parser = argparse.ArgumentParser(
        prog='python3 quantize.py',
        description='This script quantizes the given models by applying the '
        f'"{quantize_script_binary}" script on them.'
    )
    parser.add_argument(
        'models', nargs='+', choices=('7B', '13B', '30B', '65B'),
        help='The models to quantize.'
    )
    parser.add_argument(
        '-r', '--remove-16', action='store_true', dest='remove_f16',
        help='Remove the f16 model after quantizing it.'
    )
    parser.add_argument(
        '-m', '--models-path', dest='models_path',
        default=os.path.join(os.getcwd(), "models"),
        help='Specify the directory where the models are located.'
    )
    parser.add_argument(
        '-q', '--quantize-script-path', dest='quantize_script_path',
        default=os.path.join(os.getcwd(), quantize_script_binary),
        help='Specify the path to the "quantize" script.'
    )

    # TODO: Revise this code (note: argparse needs the callable int, not the
    # string 'int', as the type).
    # parser.add_argument(
    #     '-t', '--threads', dest='threads', type=int,
    #     default=os.cpu_count(),
    #     help='Specify the number of threads to use to quantize many models at '
    #     'once. Defaults to os.cpu_count().'
    # )

    args = parser.parse_args()

    if not os.path.isfile(args.quantize_script_path):
        print(
            f'The "{quantize_script_binary}" script was not found in the '
            "current location.\nIf you want to use it from another location, "
            "set the --quantize-script-path argument from the command line."
        )
        sys.exit(1)

    for model in args.models:
        # The model is separated in various parts
        # (ggml-model-f16.bin, ggml-model-f16.bin.0, ggml-model-f16.bin.1...)
        f16_model_path_base = os.path.join(
            args.models_path, model, "ggml-model-f16.bin"
        )

        # Shared tail of the "model not found" message.
        not_found_hint = (
            f"was not found in {args.models_path}{os.path.sep}{model}"
            ". If you want to use it from another location, set the "
            "--models-path argument from the command line."
        )

        # glob.glob() returns paths that already contain the directory part
        # of the pattern, so they are used as-is; joining them onto the base
        # path again breaks every relative --models-path. The base is escaped
        # so that glob metacharacters in the user-supplied directory are
        # matched literally.
        f16_model_parts_paths = sorted(
            glob.glob(f"{glob.escape(f16_model_path_base)}*")
        )

        # Without this check a missing model would be skipped silently and
        # the run would still be reported as successful.
        if not f16_model_parts_paths:
            print(
                f"The f16 model {os.path.basename(f16_model_path_base)} "
                f"{not_found_hint}"
            )
            sys.exit(1)

        for f16_model_part_path in f16_model_parts_paths:
            # A match may still be a directory or vanish between the glob
            # and this point.
            if not os.path.isfile(f16_model_part_path):
                print(
                    f"The f16 model {os.path.basename(f16_model_part_path)} "
                    f"{not_found_hint}"
                )
                sys.exit(1)

            __run_quantize_script(
                args.quantize_script_path, f16_model_part_path
            )

            if args.remove_f16:
                os.remove(f16_model_part_path)


# This is kept as a top-level function so that it can be reused if parallel
# quantization is implemented.
# See https://github.com/ggerganov/llama.cpp/pull/222/commits/f8db3d6cd91bf1a1342db9d29e3092bc12dd783c#r1140496406

def __run_quantize_script(script_path, f16_model_part_path):
    """Run the quantize script on a single f16 model part.

    The output is written next to the input, using the input's file name
    with "f16" replaced by "q4_0" (ggml-model-f16.bin.1 becomes
    ggml-model-q4_0.bin.1).

    script_path -- path to the quantize executable.
    f16_model_part_path -- path to the f16 model part to quantize.

    Raises subprocess.CalledProcessError if the executable exits with a
    non-zero status.
    """

    # Only the file name is rewritten: replacing "f16" in the whole path
    # would also alter any parent directory that happens to contain it and
    # point the output at a location that does not exist.
    model_dir, f16_filename = os.path.split(f16_model_part_path)
    new_quantized_model_path = os.path.join(
        model_dir, f16_filename.replace("f16", "q4_0")
    )

    # NOTE(review): the trailing "2" is presumably the quantization type id
    # matching q4_0 -- confirm against the quantize binary's usage text.
    subprocess.run(
        [script_path, f16_model_part_path, new_quantized_model_path, "2"],
        check=True
    )


# Script entry point: run main() and translate the outcome into a message
# and a process exit status.
if __name__ == "__main__":
    try:
        main()

    except subprocess.CalledProcessError:
        # The quantize executable returned a non-zero exit status.
        print("\nAn error occurred while trying to quantize the models.")
        sys.exit(1)

    except KeyboardInterrupt:
        # An interrupted run did not finish its work, so it must not report
        # success. 130 is the conventional shell status for termination by
        # Ctrl-C (128 + SIGINT).
        sys.exit(130)

    else:
        print("\nSuccessfully quantized all models.")
Improved quantize script (#222) * Improved quantize script I improved the quantize script by adding error handling and allowing to select many models for quantization at once in the command line. I also converted it to Python for generalization as well as extensibility. * Fixes and improvements based on Matt's observations Fixed and improved many things in the script based on the reviews made by @mattsta. The parallelization suggestion is still to be revised, but code for it was still added (commented). * Small fixes to the previous commit * Corrected to use the original glob pattern The original Bash script uses a glob pattern to match files that have endings such as ...bin.0, ...bin.1, etc. That has been translated correctly to Python now. * Added support for Windows and updated README to use this script New code to set the name of the quantize script binary depending on the platform has been added (quantize.exe if working on Windows) and the README.md file has been updated to use this script instead of the Bash one. * Fixed a typo and removed shell=True in the subprocess.run call Fixed a typo regarding the new filenames of the quantized models and removed the shell=True parameter in the subprocess.run call as it was conflicting with the list of parameters. * Corrected previous commit * Small tweak: changed the name of the program in argparse This was making the automatic help message to be suggesting the program's usage as being literally "$ Quantization Script [arguments]". It should now be something like "$ python3 quantize.py [arguments]". 2023-03-19 18:38:44 +00:00			`#!/usr/bin/env python3`

			`"""Script to execute the "quantize" script on a given set of models."""`

			`import subprocess`
			`import argparse`
			`import glob`
			`import sys`
			`import os`


			`def main():`
			`"""Update the quantize binary name depending on the platform and parse`
			`the command line arguments and execute the script.`
			`"""`

			`if "linux" in sys.platform or "darwin" in sys.platform:`
			`quantize_script_binary = "quantize"`

			`elif "win32" in sys.platform or "cygwin" in sys.platform:`
			`quantize_script_binary = "quantize.exe"`

			`else:`
			`print("WARNING: Unknown platform. Assuming a UNIX-like OS.\n")`
			`quantize_script_binary = "quantize"`

			`parser = argparse.ArgumentParser(`
			`prog='python3 quantize.py',`
			`description='This script quantizes the given models by applying the '`
			`f'"{quantize_script_binary}" script on them.'`
			`)`
			`parser.add_argument(`
			`'models', nargs='+', choices=('7B', '13B', '30B', '65B'),`
			`help='The models to quantize.'`
			`)`
			`parser.add_argument(`
			`'-r', '--remove-16', action='store_true', dest='remove_f16',`
			`help='Remove the f16 model after quantizing it.'`
			`)`
			`parser.add_argument(`
			`'-m', '--models-path', dest='models_path',`
			`default=os.path.join(os.getcwd(), "models"),`
			`help='Specify the directory where the models are located.'`
			`)`
			`parser.add_argument(`
			`'-q', '--quantize-script-path', dest='quantize_script_path',`
			`default=os.path.join(os.getcwd(), quantize_script_binary),`
			`help='Specify the path to the "quantize" script.'`
			`)`

			`# TODO: Revise this code`
			`# parser.add_argument(`
			`# '-t', '--threads', dest='threads', type='int',`
			`# default=os.cpu_count(),`
			`# help='Specify the number of threads to use to quantize many models at '`
			`# 'once. Defaults to os.cpu_count().'`
			`# )`

			`args = parser.parse_args()`

			`if not os.path.isfile(args.quantize_script_path):`
			`print(`
			`f'The "{quantize_script_binary}" script was not found in the '`
			`"current location.\nIf you want to use it from another location, "`
			`"set the --quantize-script-path argument from the command line."`
			`)`
			`sys.exit(1)`

			`for model in args.models:`
			`# The model is separated in various parts`
			`# (ggml-model-f16.bin, ggml-model-f16.bin.0, ggml-model-f16.bin.1...)`
			`f16_model_path_base = os.path.join(`
			`args.models_path, model, "ggml-model-f16.bin"`
			`)`

			`f16_model_parts_paths = map(`
			`lambda filename: os.path.join(f16_model_path_base, filename),`
			`glob.glob(f"{f16_model_path_base}*")`
			`)`

			`for f16_model_part_path in f16_model_parts_paths:`
			`if not os.path.isfile(f16_model_part_path):`
			`print(`
			`f"The f16 model {os.path.basename(f16_model_part_path)} "`
			`f"was not found in {args.models_path}{os.path.sep}{model}"`
			`". If you want to use it from another location, set the "`
			`"--models-path argument from the command line."`
			`)`
			`sys.exit(1)`

			`__run_quantize_script(`
			`args.quantize_script_path, f16_model_part_path`
			`)`

			`if args.remove_f16:`
			`os.remove(f16_model_part_path)`


			`# This was extracted to a top-level function for parallelization, if`
			`# implemented. See https://github.com/ggerganov/llama.cpp/pull/222/commits/f8db3d6cd91bf1a1342db9d29e3092bc12dd783c#r1140496406`

			`def __run_quantize_script(script_path, f16_model_part_path):`
			`"""Run the quantize script specifying the path to it and the path to the`
			`f16 model to quantize.`
			`"""`

			`new_quantized_model_path = f16_model_part_path.replace("f16", "q4_0")`
			`subprocess.run(`
			`[script_path, f16_model_part_path, new_quantized_model_path, "2"],`
			`check=True`
			`)`


			`if __name__ == "__main__":`
			`try:`
			`main()`

			`except subprocess.CalledProcessError:`
			`print("\nAn error ocurred while trying to quantize the models.")`
			`sys.exit(1)`

			`except KeyboardInterrupt:`
			`sys.exit(0)`

			`else:`
			`print("\nSuccesfully quantized all models.")`